1 /*
2 * Copyright (c) 2006 Apple Computer, Inc. All Rights Reserved.
3 *
4 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the
10 * License may not be used to create, or enable the creation or
11 * redistribution of, unlawful or unlicensed copies of an Apple operating
12 * system, or to circumvent, violate, or enable the circumvention or
13 * violation of, any terms of an Apple operating system software license
14 * agreement.
15 *
16 * Please obtain a copy of the License at
17 * http://www.opensource.apple.com/apsl/ and read it before using this
18 * file.
19 *
20 * The Original Code and all software distributed under the License are
21 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
22 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
23 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
25 * Please see the License for the specific language governing rights and
26 * limitations under the License.
27 *
28 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
29 */
30 /* Copyright (c) 1998, 1999 Apple Computer, Inc. All Rights Reserved */
31 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
32 /*
33 * Copyright (c) 1982, 1986, 1988, 1990, 1993
34 * The Regents of the University of California. All rights reserved.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution.
44 * 3. All advertising materials mentioning features or use of this software
45 * must display the following acknowledgement:
46 * This product includes software developed by the University of
47 * California, Berkeley and its contributors.
48 * 4. Neither the name of the University nor the names of its contributors
49 * may be used to endorse or promote products derived from this software
50 * without specific prior written permission.
51 *
52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62 * SUCH DAMAGE.
63 *
64 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
65 * $FreeBSD: src/sys/kern/uipc_socket.c,v 1.68.2.16 2001/06/14 20:46:06 ume Exp $
66 */
67
68 #include <sys/param.h>
69 #include <sys/systm.h>
70 #include <sys/filedesc.h>
71 #include <sys/proc_internal.h>
72 #include <sys/kauth.h>
73 #include <sys/file_internal.h>
74 #include <sys/fcntl.h>
75 #include <sys/malloc.h>
76 #include <sys/mbuf.h>
77 #include <sys/domain.h>
78 #include <sys/kernel.h>
79 #include <sys/event.h>
80 #include <sys/poll.h>
81 #include <sys/protosw.h>
82 #include <sys/socket.h>
83 #include <sys/socketvar.h>
84 #include <sys/resourcevar.h>
85 #include <sys/signalvar.h>
86 #include <sys/sysctl.h>
87 #include <sys/uio.h>
88 #include <sys/ev.h>
89 #include <sys/kdebug.h>
90 #include <net/route.h>
91 #include <netinet/in.h>
92 #include <netinet/in_pcb.h>
93 #include <kern/zalloc.h>
94 #include <kern/locks.h>
95 #include <machine/limits.h>
96
97 int so_cache_hw = 0;
98 int so_cache_timeouts = 0;
99 int so_cache_max_freed = 0;
100 int cached_sock_count = 0;
101 struct socket *socket_cache_head = 0;
102 struct socket *socket_cache_tail = 0;
103 u_long so_cache_time = 0;
104 int so_cache_init_done = 0;
105 struct zone *so_cache_zone;
106 extern int get_inpcb_str_size();
107 extern int get_tcp_str_size();
108
109 static lck_grp_t *so_cache_mtx_grp;
110 static lck_attr_t *so_cache_mtx_attr;
111 static lck_grp_attr_t *so_cache_mtx_grp_attr;
112 lck_mtx_t *so_cache_mtx;
113
114 #include <machine/limits.h>
115
116 static void filt_sordetach(struct knote *kn);
117 static int filt_soread(struct knote *kn, long hint);
118 static void filt_sowdetach(struct knote *kn);
119 static int filt_sowrite(struct knote *kn, long hint);
120 static int filt_solisten(struct knote *kn, long hint);
121
122 static struct filterops solisten_filtops =
123 { 1, NULL, filt_sordetach, filt_solisten };
124 static struct filterops soread_filtops =
125 { 1, NULL, filt_sordetach, filt_soread };
126 static struct filterops sowrite_filtops =
127 { 1, NULL, filt_sowdetach, filt_sowrite };
128
129 #define EVEN_MORE_LOCKING_DEBUG 0
130 int socket_debug = 0;
131 int socket_zone = M_SOCKET;
132 so_gen_t so_gencnt; /* generation count for sockets */
133
134 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
135 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
136
137 #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
138 #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
139 #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
140 #define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
141 #define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
142 #define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
143 #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
144
145 #define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
146
147
148 SYSCTL_DECL(_kern_ipc);
149
150 static int somaxconn = SOMAXCONN;
151 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn,
152 0, "");
153
154 /* Should we get a maximum also ??? */
155 static int sosendmaxchain = 65536;
156 static int sosendminchain = 16384;
157 static int sorecvmincopy = 16384;
158 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain, CTLFLAG_RW, &sosendminchain,
159 0, "");
160 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy, CTLFLAG_RW, &sorecvmincopy,
161 0, "");
162
163 void so_cache_timer();
164
165 /*
166 * Socket operation routines.
167 * These routines are called by the routines in
168 * sys_socket.c or from a system process, and
169 * implement the semantics of socket operations by
170 * switching out to the protocol specific routines.
171 */
172
173 #ifdef __APPLE__
174
175 vm_size_t so_cache_zone_element_size;
176
177 static int sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list, int *resid);
178
179
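/*
 * One-time initialization of the socket layer cache: set up the lock
 * group, attributes and mutex protecting the cached-socket list, create
 * the zone used for cached PF_INET/SOCK_STREAM sockets (socket + inpcb +
 * tcpcb carved from a single allocation), arm the periodic cache flush
 * timer and initialize the socket filter subsystem.
 */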
180 void socketinit()
181 {
182 vm_size_t str_size;
183
184 if (so_cache_init_done) {
185 printf("socketinit: already called...\n");
186 return;
187 }
188
189 /*
190 * allocate lock group attribute and group for socket cache mutex
191 */
192 so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
193 lck_grp_attr_setdefault(so_cache_mtx_grp_attr);
194
195 so_cache_mtx_grp = lck_grp_alloc_init("so_cache", so_cache_mtx_grp_attr);
196
197 /*
198 * allocate the lock attribute for socket cache mutex
199 */
200 so_cache_mtx_attr = lck_attr_alloc_init();
201 lck_attr_setdefault(so_cache_mtx_attr);
202
203 so_cache_init_done = 1;
204
205 so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr); /* cached sockets mutex */
206
207 if (so_cache_mtx == NULL)
208 return; /* we're hosed... */
209
210 str_size = (vm_size_t)( sizeof(struct socket) + 4 +
211 get_inpcb_str_size() + 4 +
212 get_tcp_str_size());
213 so_cache_zone = zinit (str_size, 120000*str_size, 8192, "socache zone");
214 #if TEMPDEBUG
215 printf("cached_sock_alloc -- so_cache_zone size is %x\n", str_size);
216 #endif
217 timeout(so_cache_timer, NULL, (SO_CACHE_FLUSH_INTERVAL * hz));
218
219 so_cache_zone_element_size = str_size;
220
221 sflt_init();
222
223 }
224
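/*
 * Allocate a socket for the PF_INET/SOCK_STREAM fast path.  Reuse an
 * entry from the socket cache when one is available (preserving its
 * saved pcb pointer); otherwise carve a fresh socket out of
 * so_cache_zone and lay out the inpcb and tcpcb storage, longword
 * aligned, within the same block.
 */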
225 void cached_sock_alloc(so, waitok)
226 struct socket **so;
227 int waitok;
228
229 {
230 caddr_t temp;
231 register u_long offset;
232
233
234 lck_mtx_lock(so_cache_mtx);
235
236 if (cached_sock_count) {
237 cached_sock_count--;
238 *so = socket_cache_head;
239 if (*so == 0)
240 panic("cached_sock_alloc: cached sock is null");
241
242 socket_cache_head = socket_cache_head->cache_next;
243 if (socket_cache_head)
244 socket_cache_head->cache_prev = 0;
245 else
246 socket_cache_tail = 0;
247
248 lck_mtx_unlock(so_cache_mtx);
249
250 temp = (*so)->so_saved_pcb;
251 bzero((caddr_t)*so, sizeof(struct socket));
252 #if TEMPDEBUG
253 kprintf("cached_sock_alloc - retreiving cached sock %x - count == %d\n", *so,
254 cached_sock_count);
255 #endif
256 (*so)->so_saved_pcb = temp;
257 (*so)->cached_in_sock_layer = 1;
258
259 }
260 else {
261 #if TEMPDEBUG
262 kprintf("Allocating cached sock %x from memory\n", *so);
263 #endif
264
265 lck_mtx_unlock(so_cache_mtx);
266
267 if (waitok)
268 *so = (struct socket *) zalloc(so_cache_zone);
269 else
270 *so = (struct socket *) zalloc_noblock(so_cache_zone);
271
272 if (*so == 0)
273 return;
274
275 bzero((caddr_t)*so, sizeof(struct socket));
276
277 /*
278 * Define offsets for extra structures into our single block of
279 * memory. Align extra structures on longword boundaries.
280 */
281
282
283 offset = (u_long) *so;
284 offset += sizeof(struct socket);
285 if (offset & 0x3) {
286 offset += 4;
287 offset &= 0xfffffffc;
288 }
289 (*so)->so_saved_pcb = (caddr_t) offset;
290 offset += get_inpcb_str_size();
291 if (offset & 0x3) {
292 offset += 4;
293 offset &= 0xfffffffc;
294 }
295
296 ((struct inpcb *) (*so)->so_saved_pcb)->inp_saved_ppcb = (caddr_t) offset;
297 #if TEMPDEBUG
298 kprintf("Allocating cached socket - %x, pcb=%x tcpcb=%x\n", *so,
299 (*so)->so_saved_pcb,
300 ((struct inpcb *)(*so)->so_saved_pcb)->inp_saved_ppcb);
301 #endif
302 }
303
304 (*so)->cached_in_sock_layer = 1;
305 }
306
307
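/*
 * Return a socket allocated by cached_sock_alloc() to the cache, or give
 * it back to the zone if the cache already holds MAX_CACHED_SOCKETS
 * entries.  The cache timestamp is recorded so that so_cache_timer()
 * can age idle entries out.
 */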
308 void cached_sock_free(so)
309 struct socket *so;
310 {
311
312 lck_mtx_lock(so_cache_mtx);
313
314 if (++cached_sock_count > MAX_CACHED_SOCKETS) {
315 --cached_sock_count;
316 lck_mtx_unlock(so_cache_mtx);
317 #if TEMPDEBUG
318 kprintf("Freeing overflowed cached socket %x\n", so);
319 #endif
320 zfree(so_cache_zone, so);
321 }
322 else {
323 #if TEMPDEBUG
324 kprintf("Freeing socket %x into cache\n", so);
325 #endif
326 if (so_cache_hw < cached_sock_count)
327 so_cache_hw = cached_sock_count;
328
329 so->cache_next = socket_cache_head;
330 so->cache_prev = 0;
331 if (socket_cache_head)
332 socket_cache_head->cache_prev = so;
333 else
334 socket_cache_tail = so;
335
336 so->cache_timestamp = so_cache_time;
337 socket_cache_head = so;
338 lck_mtx_unlock(so_cache_mtx);
339 }
340
341 #if TEMPDEBUG
342 kprintf("Freed cached sock %x into cache - count is %d\n", so, cached_sock_count);
343 #endif
344
345
346 }
347
348
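/*
 * Periodic cache reaper: free cached sockets idle for longer than
 * SO_CACHE_TIME_LIMIT, at most SO_CACHE_MAX_FREE_BATCH per pass, then
 * re-arm the timeout for the next SO_CACHE_FLUSH_INTERVAL.
 */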
349 void so_cache_timer()
350 {
351 register struct socket *p;
352 register int n_freed = 0;
353
354
355 lck_mtx_lock(so_cache_mtx);
356
357 ++so_cache_time;
358
359 while ( (p = socket_cache_tail) )
360 {
361 if ((so_cache_time - p->cache_timestamp) < SO_CACHE_TIME_LIMIT)
362 break;
363
364 so_cache_timeouts++;
365
366 if ( (socket_cache_tail = p->cache_prev) )
367 p->cache_prev->cache_next = 0;
368 if (--cached_sock_count == 0)
369 socket_cache_head = 0;
370
371
372 zfree(so_cache_zone, p);
373
374 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH)
375 {
376 so_cache_max_freed++;
377 break;
378 }
379 }
380 lck_mtx_unlock(so_cache_mtx);
381
382 timeout(so_cache_timer, NULL, (SO_CACHE_FLUSH_INTERVAL * hz));
383
384
385 }
386 #endif /* __APPLE__ */
387
388 /*
389 * Get a socket structure from our zone, and initialize it.
390 * We don't implement `waitok' yet (see comments in uipc_domain.c).
391 * Note that it would probably be better to allocate socket
392 * and PCB at the same time, but I'm not convinced that all
393 * the protocols can be easily modified to do this.
394 */
395 struct socket *
396 soalloc(waitok, dom, type)
397 int waitok;
398 int dom;
399 int type;
400 {
401 struct socket *so;
402
403 if ((dom == PF_INET) && (type == SOCK_STREAM))
404 cached_sock_alloc(&so, waitok);
405 else
406 {
407 MALLOC_ZONE(so, struct socket *, sizeof(*so), socket_zone, M_WAITOK);
408 if (so)
409 bzero(so, sizeof *so);
410 }
411 /* XXX race condition for reentrant kernel */
412 //###LD Atomic add for so_gencnt
413 if (so) {
414 so->so_gencnt = ++so_gencnt;
415 so->so_zone = socket_zone;
416 }
417
418 return so;
419 }
420
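/*
 * Create a socket of the given domain, type and protocol: look up the
 * protocol switch entry, allocate the socket, record the creator's uid
 * and privilege, take a use count for the creation, and let the
 * protocol's pru_attach allocate the pcb.  On attach failure the socket
 * is released via sofreelastref().
 */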
421 int
422 socreate(dom, aso, type, proto)
423 int dom;
424 struct socket **aso;
425 register int type;
426 int proto;
427 {
428 struct proc *p = current_proc();
429 register struct protosw *prp;
430 register struct socket *so;
431 register int error = 0;
432 #if TCPDEBUG
433 extern int tcpconsdebug;
434 #endif
435 if (proto)
436 prp = pffindproto(dom, proto, type);
437 else
438 prp = pffindtype(dom, type);
439
440 if (prp == 0 || prp->pr_usrreqs->pru_attach == 0)
441 return (EPROTONOSUPPORT);
442 #ifndef __APPLE__
443
444 if (p->p_prison && jail_socket_unixiproute_only &&
445 prp->pr_domain->dom_family != PF_LOCAL &&
446 prp->pr_domain->dom_family != PF_INET &&
447 prp->pr_domain->dom_family != PF_ROUTE) {
448 return (EPROTONOSUPPORT);
449 }
450
451 #endif
452 if (prp->pr_type != type)
453 return (EPROTOTYPE);
454 so = soalloc(p != 0, dom, type);
455 if (so == 0)
456 return (ENOBUFS);
457
458 TAILQ_INIT(&so->so_incomp);
459 TAILQ_INIT(&so->so_comp);
460 so->so_type = type;
461
462 #ifdef __APPLE__
463 if (p != 0) {
464 so->so_uid = kauth_cred_getuid(kauth_cred_get());
465 if (!suser(kauth_cred_get(),NULL))
466 so->so_state = SS_PRIV;
467 }
468 #else
469 so->so_cred = kauth_cred_get_with_ref();
470 #endif
471 so->so_proto = prp;
472 #ifdef __APPLE__
473 so->so_rcv.sb_flags |= SB_RECV; /* XXX */
474 so->so_rcv.sb_so = so->so_snd.sb_so = so;
475 #endif
476
 477 //### Attachment will create the per-pcb lock if necessary and increase the refcount
478 so->so_usecount++; /* for creation, make sure it's done before socket is inserted in lists */
479
480 error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
481 if (error) {
482 /*
483 * Warning:
484 * If so_pcb is not zero, the socket will be leaked,
 485 * so the protocol attachment handler must be coded carefully
486 */
487 so->so_state |= SS_NOFDREF;
488 so->so_usecount--;
489 sofreelastref(so, 1); /* will deallocate the socket */
490 return (error);
491 }
492 #ifdef __APPLE__
493 prp->pr_domain->dom_refs++;
494 TAILQ_INIT(&so->so_evlist);
495
496 /* Attach socket filters for this protocol */
497 sflt_initsock(so);
498 #if TCPDEBUG
499 if (tcpconsdebug == 2)
500 so->so_options |= SO_DEBUG;
501 #endif
502 #endif
503
504 *aso = so;
505 return (0);
506 }
507
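/*
 * Bind the socket to a local address.  Attached socket filters get a
 * chance to intercept (or consume, via EJUSTRETURN) the request before
 * it is handed to the protocol's pru_bind.
 */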
508 int
509 sobind(so, nam)
510 struct socket *so;
511 struct sockaddr *nam;
512
513 {
514 struct proc *p = current_proc();
515 int error = 0;
516 struct socket_filter_entry *filter;
517 int filtered = 0;
518
519 socket_lock(so, 1);
520
521 /* Socket filter */
522 error = 0;
523 for (filter = so->so_filt; filter && (error == 0);
524 filter = filter->sfe_next_onsocket) {
525 if (filter->sfe_filter->sf_filter.sf_bind) {
526 if (filtered == 0) {
527 filtered = 1;
528 sflt_use(so);
529 socket_unlock(so, 0);
530 }
531 error = filter->sfe_filter->sf_filter.sf_bind(
532 filter->sfe_cookie, so, nam);
533 }
534 }
535 if (filtered != 0) {
536 socket_lock(so, 0);
537 sflt_unuse(so);
538 }
539 /* End socket filter */
540
541 if (error == 0)
542 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
543
544 socket_unlock(so, 1);
545
546 if (error == EJUSTRETURN)
547 error = 0;
548
549 return (error);
550 }
551
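/*
 * Final release of a socket's memory: bump the generation count, then
 * either return the socket to the socket-layer cache or free it back to
 * its zone, panicking on a double deallocation.
 */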
552 void
553 sodealloc(so)
554 struct socket *so;
555 {
556 so->so_gencnt = ++so_gencnt;
557
558 #ifndef __APPLE__
559 if (so->so_rcv.sb_hiwat)
560 (void)chgsbsize(so->so_cred->cr_uidinfo,
561 &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
562 if (so->so_snd.sb_hiwat)
563 (void)chgsbsize(so->so_cred->cr_uidinfo,
564 &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
565 #ifdef INET
566 if (so->so_accf != NULL) {
567 if (so->so_accf->so_accept_filter != NULL &&
568 so->so_accf->so_accept_filter->accf_destroy != NULL) {
569 so->so_accf->so_accept_filter->accf_destroy(so);
570 }
571 if (so->so_accf->so_accept_filter_str != NULL)
572 FREE(so->so_accf->so_accept_filter_str, M_ACCF);
573 FREE(so->so_accf, M_ACCF);
574 }
575 #endif /* INET */
576 kauth_cred_rele(so->so_cred);
577 zfreei(so->so_zone, so);
578 #else
579 if (so->cached_in_sock_layer == 1)
580 cached_sock_free(so);
581 else {
582 if (so->cached_in_sock_layer == -1)
583 panic("sodealloc: double dealloc: so=%x\n", so);
584 so->cached_in_sock_layer = -1;
585 FREE_ZONE(so, sizeof(*so), so->so_zone);
586 }
587 #endif /* __APPLE__ */
588 }
589
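/*
 * Mark the socket as willing to accept connections.  Socket filters may
 * veto the request; otherwise the protocol's pru_listen runs,
 * SO_ACCEPTCONN is set if the completed-connection queue is empty, and
 * an out-of-range backlog is replaced by somaxconn.
 */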
590 int
591 solisten(so, backlog)
592 register struct socket *so;
593 int backlog;
594
595 {
596 struct proc *p = current_proc();
597 int error;
598
599 socket_lock(so, 1);
600
601 {
602 struct socket_filter_entry *filter;
603 int filtered = 0;
604 error = 0;
605 for (filter = so->so_filt; filter && (error == 0);
606 filter = filter->sfe_next_onsocket) {
607 if (filter->sfe_filter->sf_filter.sf_listen) {
608 if (filtered == 0) {
609 filtered = 1;
610 sflt_use(so);
611 socket_unlock(so, 0);
612 }
613 error = filter->sfe_filter->sf_filter.sf_listen(
614 filter->sfe_cookie, so);
615 }
616 }
617 if (filtered != 0) {
618 socket_lock(so, 0);
619 sflt_unuse(so);
620 }
621 }
622
623 if (error == 0) {
624 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
625 }
626
627 if (error) {
628 socket_unlock(so, 1);
629 if (error == EJUSTRETURN)
630 error = 0;
631 return (error);
632 }
633
634 if (TAILQ_EMPTY(&so->so_comp))
635 so->so_options |= SO_ACCEPTCONN;
636 if (backlog < 0 || backlog > somaxconn)
637 backlog = somaxconn;
638 so->so_qlimit = backlog;
639
640 socket_unlock(so, 1);
641 return (0);
642 }
643
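/*
 * Drop the last reference on a socket: detach its filters, remove it
 * from a listening parent's incomplete queue if needed, flush the send
 * and receive buffers and, if requested, deallocate it.  Sockets whose
 * pcb is not yet cleared, that are still referenced by a file
 * descriptor, or that sit on an accept(2) completion queue are left in
 * place.
 */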
644 void
645 sofreelastref(so, dealloc)
646 register struct socket *so;
647 int dealloc;
648 {
649 int error;
650 struct socket *head = so->so_head;
651
652 /*### Assume socket is locked */
653
654 /* Remove any filters - may be called more than once */
655 sflt_termsock(so);
656
657 if ((!(so->so_flags & SOF_PCBCLEARING)) || ((so->so_state & SS_NOFDREF) == 0)) {
658 #ifdef __APPLE__
659 selthreadclear(&so->so_snd.sb_sel);
660 selthreadclear(&so->so_rcv.sb_sel);
661 so->so_rcv.sb_flags &= ~SB_UPCALL;
662 so->so_snd.sb_flags &= ~SB_UPCALL;
663 #endif
664 return;
665 }
666 if (head != NULL) {
667 socket_lock(head, 1);
668 if (so->so_state & SS_INCOMP) {
669 TAILQ_REMOVE(&head->so_incomp, so, so_list);
670 head->so_incqlen--;
671 } else if (so->so_state & SS_COMP) {
672 /*
673 * We must not decommission a socket that's
674 * on the accept(2) queue. If we do, then
675 * accept(2) may hang after select(2) indicated
676 * that the listening socket was ready.
677 */
678 #ifdef __APPLE__
679 selthreadclear(&so->so_snd.sb_sel);
680 selthreadclear(&so->so_rcv.sb_sel);
681 so->so_rcv.sb_flags &= ~SB_UPCALL;
682 so->so_snd.sb_flags &= ~SB_UPCALL;
683 #endif
684 socket_unlock(head, 1);
685 return;
686 } else {
687 panic("sofree: not queued");
688 }
689 head->so_qlen--;
690 so->so_state &= ~SS_INCOMP;
691 so->so_head = NULL;
692 socket_unlock(head, 1);
693 }
694 #ifdef __APPLE__
695 selthreadclear(&so->so_snd.sb_sel);
696 sbrelease(&so->so_snd);
697 #endif
698 sorflush(so);
699
700 /* 3932268: disable upcall */
701 so->so_rcv.sb_flags &= ~SB_UPCALL;
702 so->so_snd.sb_flags &= ~SB_UPCALL;
703
704 if (dealloc)
705 sodealloc(so);
706 }
707
708 /*
709 * Close a socket on last file table reference removal.
710 * Initiate disconnect if connected.
711 * Free socket when disconnect complete.
712 */
713 int
714 soclose_locked(so)
715 register struct socket *so;
716 {
717 int error = 0;
718 lck_mtx_t * mutex_held;
719 struct timespec ts;
720
721 if (so->so_usecount == 0) {
722 panic("soclose: so=%x refcount=0\n", so);
723 }
724
725 sflt_notify(so, sock_evt_closing, NULL);
726
727 if ((so->so_options & SO_ACCEPTCONN)) {
728 struct socket *sp;
729
 730 /* We do not want new connections to be added to the connection queues */
731 so->so_options &= ~SO_ACCEPTCONN;
732
733 while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
 734 /* A bit tricky here. If the protocol uses a single
 735 * global lock, we simply keep holding it; but with
 736 * per-socket locks we must drop the head's lock and
 737 * lock the incoming socket instead...
 738 */
739 if (so->so_proto->pr_getlock != NULL) {
740 socket_unlock(so, 0);
741 socket_lock(sp, 1);
742 }
743 (void) soabort(sp);
744 if (so->so_proto->pr_getlock != NULL) {
745 socket_unlock(sp, 1);
746 socket_lock(so, 0);
747 }
748 }
749
750 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
751 /* Dequeue from so_comp since sofree() won't do it */
752 TAILQ_REMOVE(&so->so_comp, sp, so_list);
753 so->so_qlen--;
754
755 if (so->so_proto->pr_getlock != NULL) {
756 socket_unlock(so, 0);
757 socket_lock(sp, 1);
758 }
759
760 sp->so_state &= ~SS_COMP;
761 sp->so_head = NULL;
762
763 (void) soabort(sp);
764 if (so->so_proto->pr_getlock != NULL) {
765 socket_unlock(sp, 1);
766 socket_lock(so, 0);
767 }
768 }
769 }
770 if (so->so_pcb == 0) {
771 /* 3915887: mark the socket as ready for dealloc */
772 so->so_flags |= SOF_PCBCLEARING;
773 goto discard;
774 }
775 if (so->so_state & SS_ISCONNECTED) {
776 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
777 error = sodisconnectlocked(so);
778 if (error)
779 goto drop;
780 }
781 if (so->so_options & SO_LINGER) {
782 if ((so->so_state & SS_ISDISCONNECTING) &&
783 (so->so_state & SS_NBIO))
784 goto drop;
785 if (so->so_proto->pr_getlock != NULL)
786 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
787 else
788 mutex_held = so->so_proto->pr_domain->dom_mtx;
789 while (so->so_state & SS_ISCONNECTED) {
790 ts.tv_sec = (so->so_linger/100);
791 ts.tv_nsec = (so->so_linger % 100) * NSEC_PER_USEC * 1000 * 10;
792 error = msleep((caddr_t)&so->so_timeo, mutex_held,
793 PSOCK | PCATCH, "soclos", &ts);
794 if (error) {
 795 /* It's OK when the timer fires, don't report an error */
796 if (error == EWOULDBLOCK)
797 error = 0;
798 break;
799 }
800 }
801 }
802 }
803 drop:
804 if (so->so_usecount == 0)
805 panic("soclose: usecount is zero so=%x\n", so);
806 if (so->so_pcb && !(so->so_flags & SOF_PCBCLEARING)) {
807 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
808 if (error == 0)
809 error = error2;
810 }
811 if (so->so_usecount <= 0)
812 panic("soclose: usecount is zero so=%x\n", so);
813 discard:
814 if (so->so_pcb && so->so_state & SS_NOFDREF)
815 panic("soclose: NOFDREF");
816 so->so_state |= SS_NOFDREF;
817 #ifdef __APPLE__
818 so->so_proto->pr_domain->dom_refs--;
819 evsofree(so);
820 #endif
821 so->so_usecount--;
822 sofree(so);
823 return (error);
824 }
825
826 int
827 soclose(so)
828 register struct socket *so;
829 {
830 int error = 0;
831 socket_lock(so, 1);
832 if (so->so_retaincnt == 0)
833 error = soclose_locked(so);
 834 else { /* if the FD is going away but the socket is retained in the kernel, remove its reference */
835 so->so_usecount--;
836 if (so->so_usecount < 2)
837 panic("soclose: retaincnt non null and so=%x usecount=%x\n", so->so_usecount);
838 }
839 socket_unlock(so, 1);
840 return (error);
841 }
842
843
844 /*
845 * Must be called at splnet...
846 */
847 //#### Should already be locked
848 int
849 soabort(so)
850 struct socket *so;
851 {
852 int error;
853
854 #ifdef MORE_LOCKING_DEBUG
855 lck_mtx_t * mutex_held;
856
857 if (so->so_proto->pr_getlock != NULL)
858 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
859 else
860 mutex_held = so->so_proto->pr_domain->dom_mtx;
861 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
862 #endif
863
864 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
865 if (error) {
866 sofree(so);
867 return error;
868 }
869 return (0);
870 }
871
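/*
 * Accept a connection that has already been taken off a listener's
 * completed queue: clear SS_NOFDREF and fetch the peer address through
 * the protocol's pru_accept.  `dolock' selects whether the socket lock
 * is taken here or is already held by the caller.
 */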
872 int
873 soacceptlock(so, nam, dolock)
874 register struct socket *so;
875 struct sockaddr **nam;
876 int dolock;
877 {
878 int error;
879
880 if (dolock) socket_lock(so, 1);
881
882 if ((so->so_state & SS_NOFDREF) == 0)
883 panic("soaccept: !NOFDREF");
884 so->so_state &= ~SS_NOFDREF;
885 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
886
887 if (dolock) socket_unlock(so, 1);
888 return (error);
889 }
890 int
891 soaccept(so, nam)
892 register struct socket *so;
893 struct sockaddr **nam;
894 {
895 return (soacceptlock(so, nam, 1));
896 }
897
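/*
 * Initiate a connection to `nam'.  A listening socket cannot connect; a
 * connection-based socket may connect only once, while a connectionless
 * socket is implicitly disconnected first.  Connect-out filters run
 * before the protocol's pru_connect.
 */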
898 int
899 soconnectlock(so, nam, dolock)
900 register struct socket *so;
901 struct sockaddr *nam;
902 int dolock;
903
904 {
905 int s;
906 int error;
907 struct proc *p = current_proc();
908
909 if (dolock) socket_lock(so, 1);
910
911 if (so->so_options & SO_ACCEPTCONN) {
912 if (dolock) socket_unlock(so, 1);
913 return (EOPNOTSUPP);
914 }
915 /*
916 * If protocol is connection-based, can only connect once.
917 * Otherwise, if connected, try to disconnect first.
918 * This allows user to disconnect by connecting to, e.g.,
919 * a null address.
920 */
921 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
922 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
923 (error = sodisconnectlocked(so))))
924 error = EISCONN;
925 else {
926 /*
927 * Run connect filter before calling protocol:
928 * - non-blocking connect returns before completion;
929 */
930 {
931 struct socket_filter_entry *filter;
932 int filtered = 0;
933 error = 0;
934 for (filter = so->so_filt; filter && (error == 0);
935 filter = filter->sfe_next_onsocket) {
936 if (filter->sfe_filter->sf_filter.sf_connect_out) {
937 if (filtered == 0) {
938 filtered = 1;
939 sflt_use(so);
940 socket_unlock(so, 0);
941 }
942 error = filter->sfe_filter->sf_filter.sf_connect_out(
943 filter->sfe_cookie, so, nam);
944 }
945 }
946 if (filtered != 0) {
947 socket_lock(so, 0);
948 sflt_unuse(so);
949 }
950 }
951 if (error) {
952 if (error == EJUSTRETURN)
953 error = 0;
954 if (dolock) socket_unlock(so, 1);
955 return error;
956 }
957
958 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p);
959 }
960 if (dolock) socket_unlock(so, 1);
961 return (error);
962 }
963
964 int
965 soconnect(so, nam)
966 register struct socket *so;
967 struct sockaddr *nam;
968 {
969 return (soconnectlock(so, nam, 1));
970 }
971
972 int
973 soconnect2(so1, so2)
974 register struct socket *so1;
975 struct socket *so2;
976 {
977 int error;
 978 //####### Assumes so1 is already locked
979
980 socket_lock(so2, 1);
981
982 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
983
984 socket_unlock(so2, 1);
985 return (error);
986 }
987
988
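/*
 * Disconnect a connected socket; the caller holds the socket lock.
 * Returns ENOTCONN if the socket is not connected and EALREADY if a
 * disconnect is already in progress; on success the socket filters are
 * notified of the disconnect.
 */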
989 int
990 sodisconnectlocked(so)
991 register struct socket *so;
992 {
993 int error;
994
995 if ((so->so_state & SS_ISCONNECTED) == 0) {
996 error = ENOTCONN;
997 goto bad;
998 }
999 if (so->so_state & SS_ISDISCONNECTING) {
1000 error = EALREADY;
1001 goto bad;
1002 }
1003
1004 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1005
1006 if (error == 0) {
1007 sflt_notify(so, sock_evt_disconnected, NULL);
1008 }
1009
1010 bad:
1011 return (error);
1012 }
1013 //### Locking version
1014 int
1015 sodisconnect(so)
1016 register struct socket *so;
1017 {
1018 int error;
1019
1020 socket_lock(so, 1);
1021 error = sodisconnectlocked(so);
1022 socket_unlock(so, 1);
1023 return(error);
1024 }
1025
1026 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_DONTWAIT : M_WAIT)
1027
1028 /*
1029 * sosendcheck will lock the socket buffer if it isn't locked and
1030 * verify that there is space for the data being inserted.
1031 */
1032
1033 static int
1034 sosendcheck(
1035 struct socket *so,
1036 struct sockaddr *addr,
1037 long resid,
1038 long clen,
1039 long atomic,
1040 int flags,
1041 int *sblocked)
1042 {
1043 int error = 0;
1044 long space;
1045 int assumelock = 0;
1046
1047 restart:
1048 if (*sblocked == 0) {
1049 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1050 so->so_send_filt_thread != 0 &&
1051 so->so_send_filt_thread == current_thread()) {
1052 /*
1053 * We're being called recursively from a filter,
1054 * allow this to continue. Radar 4150520.
1055 * Don't set sblocked because we don't want
1056 * to perform an unlock later.
1057 */
1058 assumelock = 1;
1059 }
1060 else {
1061 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1062 if (error) {
1063 return error;
1064 }
1065 *sblocked = 1;
1066 }
1067 }
1068
1069 if (so->so_state & SS_CANTSENDMORE)
1070 return EPIPE;
1071
1072 if (so->so_error) {
1073 error = so->so_error;
1074 so->so_error = 0;
1075 return error;
1076 }
1077
1078 if ((so->so_state & SS_ISCONNECTED) == 0) {
1079 /*
 1080 * `sendto' and `sendmsg' are allowed on a connection-
1081 * based socket if it supports implied connect.
1082 * Return ENOTCONN if not connected and no address is
1083 * supplied.
1084 */
1085 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1086 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1087 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1088 !(resid == 0 && clen != 0))
1089 return ENOTCONN;
1090 } else if (addr == 0 && !(flags&MSG_HOLD))
1091 return (so->so_proto->pr_flags & PR_CONNREQUIRED) ? ENOTCONN : EDESTADDRREQ;
1092 }
1093 space = sbspace(&so->so_snd);
1094 if (flags & MSG_OOB)
1095 space += 1024;
1096 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1097 clen > so->so_snd.sb_hiwat)
1098 return EMSGSIZE;
1099 if (space < resid + clen &&
1100 (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1101 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) || assumelock) {
1102 return EWOULDBLOCK;
1103 }
1104 sbunlock(&so->so_snd, 1);
1105 error = sbwait(&so->so_snd);
1106 if (error) {
1107 return error;
1108 }
1109 goto restart;
1110 }
1111
1112 return 0;
1113 }
1114
1115 /*
1116 * Send on a socket.
1117 * If send must go all at once and message is larger than
1118 * send buffering, then hard error.
1119 * Lock against other senders.
1120 * If must go all at once and not enough room now, then
1121 * inform user that this would block and do nothing.
1122 * Otherwise, if nonblocking, send as much as possible.
1123 * The data to be sent is described by "uio" if nonzero,
1124 * otherwise by the mbuf chain "top" (which must be null
1125 * if uio is not). Data provided in mbuf chain must be small
1126 * enough to send all at once.
1127 *
1128 * Returns nonzero on error, timeout or signal; callers
1129 * must check for short counts if EINTR/ERESTART are returned.
1130 * Data and control buffers are freed on return.
1131 * Experiment:
1132 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1133 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1134 * point at the mbuf chain being constructed and go from there.
1135 */
1136 int
1137 sosend(so, addr, uio, top, control, flags)
1138 register struct socket *so;
1139 struct sockaddr *addr;
1140 struct uio *uio;
1141 struct mbuf *top;
1142 struct mbuf *control;
1143 int flags;
1144
1145 {
1146 struct mbuf **mp;
1147 register struct mbuf *m, *freelist = NULL;
1148 register long space, len, resid;
1149 int clen = 0, error, dontroute, mlen, sendflags;
1150 int atomic = sosendallatonce(so) || top;
1151 int sblocked = 0;
1152 struct proc *p = current_proc();
1153
1154 if (uio)
1155 // LP64todo - fix this!
1156 resid = uio_resid(uio);
1157 else
1158 resid = top->m_pkthdr.len;
1159
1160 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START),
1161 so,
1162 resid,
1163 so->so_snd.sb_cc,
1164 so->so_snd.sb_lowat,
1165 so->so_snd.sb_hiwat);
1166
1167 socket_lock(so, 1);
1168
1169 /*
1170 * In theory resid should be unsigned.
1171 * However, space must be signed, as it might be less than 0
1172 * if we over-committed, and we must use a signed comparison
1173 * of space and resid. On the other hand, a negative resid
1174 * causes us to loop sending 0-length segments to the protocol.
1175 *
1176 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1177 * type sockets since that's an error.
1178 */
1179 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1180 error = EINVAL;
1181 socket_unlock(so, 1);
1182 goto out;
1183 }
1184
1185 dontroute =
1186 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1187 (so->so_proto->pr_flags & PR_ATOMIC);
1188 if (p)
1189 p->p_stats->p_ru.ru_msgsnd++;
1190 if (control)
1191 clen = control->m_len;
1192
1193 do {
1194 error = sosendcheck(so, addr, resid, clen, atomic, flags, &sblocked);
1195 if (error) {
1196 goto release;
1197 }
1198 mp = &top;
1199 space = sbspace(&so->so_snd) - clen + ((flags & MSG_OOB) ? 1024 : 0);
1200
1201 do {
1202
1203 if (uio == NULL) {
1204 /*
1205 * Data is prepackaged in "top".
1206 */
1207 resid = 0;
1208 if (flags & MSG_EOR)
1209 top->m_flags |= M_EOR;
1210 } else {
1211 int chainlength;
1212 int bytes_to_copy;
1213
1214 bytes_to_copy = min(resid, space);
1215
1216 if (sosendminchain > 0) {
1217 chainlength = 0;
1218 } else
1219 chainlength = sosendmaxchain;
1220
1221 socket_unlock(so, 0);
1222
1223 do {
1224 int num_needed;
1225 int hdrs_needed = (top == 0) ? 1 : 0;
1226
1227 /*
1228 * try to maintain a local cache of mbuf clusters needed to complete this write
1229 * the list is further limited to the number that are currently needed to fill the socket
1230 * this mechanism allows a large number of mbufs/clusters to be grabbed under a single
 1231 * mbuf lock... if we can't get any clusters, then fall back to trying for mbufs
 1232 * if we fail early (or miscalculate the number needed) make sure to release any clusters
1233 * we haven't yet consumed.
1234 */
1235 if (freelist == NULL && bytes_to_copy > MCLBYTES) {
1236 num_needed = bytes_to_copy / NBPG;
1237
1238 if ((bytes_to_copy - (num_needed * NBPG)) >= MINCLSIZE)
1239 num_needed++;
1240
1241 freelist = m_getpackets_internal(&num_needed, hdrs_needed, M_WAIT, 0, NBPG);
1242 /* Fall back to cluster size if allocation failed */
1243 }
1244
1245 if (freelist == NULL && bytes_to_copy > MINCLSIZE) {
1246 num_needed = bytes_to_copy / MCLBYTES;
1247
1248 if ((bytes_to_copy - (num_needed * MCLBYTES)) >= MINCLSIZE)
1249 num_needed++;
1250
1251 freelist = m_getpackets_internal(&num_needed, hdrs_needed, M_WAIT, 0, MCLBYTES);
1252 /* Fall back to a single mbuf if allocation failed */
1253 }
1254
1255 if (freelist == NULL) {
1256 if (top == 0)
1257 MGETHDR(freelist, M_WAIT, MT_DATA);
1258 else
1259 MGET(freelist, M_WAIT, MT_DATA);
1260
1261 if (freelist == NULL) {
1262 error = ENOBUFS;
1263 socket_lock(so, 0);
1264 goto release;
1265 }
1266 /*
1267 * For datagram protocols, leave room
1268 * for protocol headers in first mbuf.
1269 */
1270 if (atomic && top == 0 && bytes_to_copy < MHLEN)
1271 MH_ALIGN(freelist, bytes_to_copy);
1272 }
1273 m = freelist;
1274 freelist = m->m_next;
1275 m->m_next = NULL;
1276
1277 if ((m->m_flags & M_EXT))
1278 mlen = m->m_ext.ext_size;
1279 else if ((m->m_flags & M_PKTHDR))
1280 mlen = MHLEN - m_leadingspace(m);
1281 else
1282 mlen = MLEN;
1283 len = min(mlen, bytes_to_copy);
1284
1285 chainlength += len;
1286
1287 space -= len;
1288
1289 error = uiomove(mtod(m, caddr_t), (int)len, uio);
1290
1291 // LP64todo - fix this!
1292 resid = uio_resid(uio);
1293
1294 m->m_len = len;
1295 *mp = m;
1296 top->m_pkthdr.len += len;
1297 if (error)
1298 break;
1299 mp = &m->m_next;
1300 if (resid <= 0) {
1301 if (flags & MSG_EOR)
1302 top->m_flags |= M_EOR;
1303 break;
1304 }
1305 bytes_to_copy = min(resid, space);
1306
1307 } while (space > 0 && (chainlength < sosendmaxchain || atomic || resid < MINCLSIZE));
1308
1309 socket_lock(so, 0);
1310
1311 if (error)
1312 goto release;
1313 }
1314
1315 if (flags & (MSG_HOLD|MSG_SEND))
1316 {
1317 /* Enqueue for later, go away if HOLD */
1318 register struct mbuf *mb1;
1319 if (so->so_temp && (flags & MSG_FLUSH))
1320 {
1321 m_freem(so->so_temp);
1322 so->so_temp = NULL;
1323 }
1324 if (so->so_temp)
1325 so->so_tail->m_next = top;
1326 else
1327 so->so_temp = top;
1328 mb1 = top;
1329 while (mb1->m_next)
1330 mb1 = mb1->m_next;
1331 so->so_tail = mb1;
1332 if (flags & MSG_HOLD)
1333 {
1334 top = NULL;
1335 goto release;
1336 }
1337 top = so->so_temp;
1338 }
1339 if (dontroute)
1340 so->so_options |= SO_DONTROUTE;
1341 /* Compute flags here, for pru_send and NKEs */
1342 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
1343 /*
 1344 * If the user set MSG_EOF, the protocol
 1345 * understands this flag, and there is nothing left to
 1346 * send, then use PRU_SEND_EOF instead of PRU_SEND.
1347 */
1348 ((flags & MSG_EOF) &&
1349 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1350 (resid <= 0)) ?
1351 PRUS_EOF :
1352 /* If there is more to send set PRUS_MORETOCOME */
1353 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
1354
1355 /*
1356 * Socket filter processing
1357 */
1358 {
1359 struct socket_filter_entry *filter;
1360 int filtered;
1361
1362 filtered = 0;
1363 error = 0;
1364 for (filter = so->so_filt; filter && (error == 0);
1365 filter = filter->sfe_next_onsocket) {
1366 if (filter->sfe_filter->sf_filter.sf_data_out) {
1367 int so_flags = 0;
1368 if (filtered == 0) {
1369 filtered = 1;
1370 so->so_send_filt_thread = current_thread();
1371 sflt_use(so);
1372 socket_unlock(so, 0);
1373 so_flags = (sendflags & MSG_OOB) ? sock_data_filt_flag_oob : 0;
1374 }
1375 error = filter->sfe_filter->sf_filter.sf_data_out(
1376 filter->sfe_cookie, so, addr, &top, &control, so_flags);
1377 }
1378 }
1379
1380 if (filtered) {
1381 /*
1382 * At this point, we've run at least one filter.
1383 * The socket is unlocked as is the socket buffer.
1384 */
1385 socket_lock(so, 0);
1386 sflt_unuse(so);
1387 so->so_send_filt_thread = 0;
1388 if (error) {
1389 if (error == EJUSTRETURN) {
1390 error = 0;
1391 clen = 0;
1392 control = 0;
1393 top = 0;
1394 }
1395
1396 goto release;
1397 }
1398 }
1399 }
1400 /*
1401 * End Socket filter processing
1402 */
1403
1404 if (error == EJUSTRETURN) {
1405 /* A socket filter handled this data */
1406 error = 0;
1407 }
1408 else {
1409 error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1410 sendflags, top, addr, control, p);
1411 }
1412 #ifdef __APPLE__
1413 if (flags & MSG_SEND)
1414 so->so_temp = NULL;
1415 #endif
1416 if (dontroute)
1417 so->so_options &= ~SO_DONTROUTE;
1418 clen = 0;
1419 control = 0;
1420 top = 0;
1421 mp = &top;
1422 if (error)
1423 goto release;
1424 } while (resid && space > 0);
1425 } while (resid);
1426
1427 release:
1428 if (sblocked)
1429 sbunlock(&so->so_snd, 0); /* will unlock socket */
1430 else
1431 socket_unlock(so, 1);
1432 out:
1433 if (top)
1434 m_freem(top);
1435 if (control)
1436 m_freem(control);
1437 if (freelist)
1438 m_freem_list(freelist);
1439
1440 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END,
1441 so,
1442 resid,
1443 so->so_snd.sb_cc,
1444 space,
1445 error);
1446
1447 return (error);
1448 }
1449
1450 /*
1451 * Implement receive operations on a socket.
1452 * We depend on the way that records are added to the sockbuf
1453 * by sbappend*. In particular, each record (mbufs linked through m_next)
1454 * must begin with an address if the protocol so specifies,
1455 * followed by an optional mbuf or mbufs containing ancillary data,
1456 * and then zero or more mbufs of data.
1457 * In order to avoid blocking network interrupts for the entire time here,
1458 * we splx() while doing the actual copy to user space.
1459 * Although the sockbuf is locked, new data may still be appended,
1460 * and thus we must maintain consistency of the sockbuf during that time.
1461 *
1462 * The caller may receive the data as a single mbuf chain by supplying
1463 * an mbuf **mp0 for use in returning the chain. The uio is then used
1464 * only for the count in uio_resid.
1465 */
1466 int
1467 soreceive(so, psa, uio, mp0, controlp, flagsp)
1468 register struct socket *so;
1469 struct sockaddr **psa;
1470 struct uio *uio;
1471 struct mbuf **mp0;
1472 struct mbuf **controlp;
1473 int *flagsp;
1474 {
1475 register struct mbuf *m, **mp, *ml = NULL;
1476 register int flags, len, error, offset;
1477 struct protosw *pr = so->so_proto;
1478 struct mbuf *nextrecord;
1479 int moff, type = 0;
1480 // LP64todo - fix this!
1481 int orig_resid = uio_resid(uio);
1482 volatile struct mbuf *free_list;
1483 volatile int delayed_copy_len;
1484 int can_delay;
1485 int need_event;
1486 struct proc *p = current_proc();
1487
1488
1489 // LP64todo - fix this!
1490 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START,
1491 so,
1492 uio_resid(uio),
1493 so->so_rcv.sb_cc,
1494 so->so_rcv.sb_lowat,
1495 so->so_rcv.sb_hiwat);
1496
1497 socket_lock(so, 1);
1498
1499 #ifdef MORE_LOCKING_DEBUG
1500 if (so->so_usecount == 1)
1501 panic("soreceive: so=%x no other reference on socket\n", so);
1502 #endif
1503 mp = mp0;
1504 if (psa)
1505 *psa = 0;
1506 if (controlp)
1507 *controlp = 0;
1508 if (flagsp)
1509 flags = *flagsp &~ MSG_EOR;
1510 else
1511 flags = 0;
1512 /*
1513 * When SO_WANTOOBFLAG is set we try to get out-of-band data
 1514 * regardless of the flags argument. Here is the case where
1515 * out-of-band data is not inline.
1516 */
1517 if ((flags & MSG_OOB) ||
1518 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
1519 (so->so_options & SO_OOBINLINE) == 0 &&
1520 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
1521 m = m_get(M_WAIT, MT_DATA);
1522 if (m == NULL) {
1523 socket_unlock(so, 1);
1524 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, ENOBUFS,0,0,0,0);
1525 return (ENOBUFS);
1526 }
1527 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1528 if (error)
1529 goto bad;
1530 socket_unlock(so, 0);
1531 do {
1532 // LP64todo - fix this!
1533 error = uiomove(mtod(m, caddr_t),
1534 (int) min(uio_resid(uio), m->m_len), uio);
1535 m = m_free(m);
1536 } while (uio_resid(uio) && error == 0 && m);
1537 socket_lock(so, 0);
1538 bad:
1539 if (m)
1540 m_freem(m);
1541 #ifdef __APPLE__
1542 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
1543 if (error == EWOULDBLOCK || error == EINVAL) {
1544 /*
1545 * Let's try to get normal data:
 1546 * EWOULDBLOCK: out-of-band data not received yet;
1547 * EINVAL: out-of-band data already read.
1548 */
1549 error = 0;
1550 goto nooob;
1551 } else if (error == 0 && flagsp)
1552 *flagsp |= MSG_OOB;
1553 }
1554 socket_unlock(so, 1);
1555 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,0,0,0,0);
1556 #endif
1557 return (error);
1558 }
1559 nooob:
1560 if (mp)
1561 *mp = (struct mbuf *)0;
1562 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio))
1563 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
1564
1565
1566 free_list = (struct mbuf *)0;
1567 delayed_copy_len = 0;
1568 restart:
1569 #ifdef MORE_LOCKING_DEBUG
1570 if (so->so_usecount <= 1)
1571 printf("soreceive: sblock so=%x ref=%d on socket\n", so, so->so_usecount);
1572 #endif
1573 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1574 if (error) {
1575 socket_unlock(so, 1);
1576 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,0,0,0,0);
1577 return (error);
1578 }
1579
1580 m = so->so_rcv.sb_mb;
1581 /*
1582 * If we have less data than requested, block awaiting more
1583 * (subject to any timeout) if:
1584 * 1. the current count is less than the low water mark, or
1585 * 2. MSG_WAITALL is set, and it is possible to do the entire
1586 * receive operation at once if we block (resid <= hiwat).
1587 * 3. MSG_DONTWAIT is not set
1588 * If MSG_WAITALL is set but resid is larger than the receive buffer,
1589 * we have to do the receive in sections, and thus risk returning
1590 * a short count if a timeout or signal occurs after we start.
1591 */
1592 if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
1593 so->so_rcv.sb_cc < uio_resid(uio)) &&
1594 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
1595 ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
1596 m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
1597
1598 KASSERT(m != 0 || !so->so_rcv.sb_cc, ("receive 1"));
1599 if (so->so_error) {
1600 if (m)
1601 goto dontblock;
1602 error = so->so_error;
1603 if ((flags & MSG_PEEK) == 0)
1604 so->so_error = 0;
1605 goto release;
1606 }
1607 if (so->so_state & SS_CANTRCVMORE) {
1608 if (m)
1609 goto dontblock;
1610 else
1611 goto release;
1612 }
1613 for (; m; m = m->m_next)
1614 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
1615 m = so->so_rcv.sb_mb;
1616 goto dontblock;
1617 }
1618 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1619 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1620 error = ENOTCONN;
1621 goto release;
1622 }
1623 if (uio_resid(uio) == 0)
1624 goto release;
1625 if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1626 error = EWOULDBLOCK;
1627 goto release;
1628 }
1629 sbunlock(&so->so_rcv, 1);
1630 #ifdef EVEN_MORE_LOCKING_DEBUG
1631 if (socket_debug)
1632 printf("Waiting for socket data\n");
1633 #endif
1634
1635 error = sbwait(&so->so_rcv);
1636 #ifdef EVEN_MORE_LOCKING_DEBUG
1637 if (socket_debug)
1638 printf("SORECEIVE - sbwait returned %d\n", error);
1639 #endif
1640 if (so->so_usecount < 1)
1641 panic("soreceive: after 2nd sblock so=%x ref=%d on socket\n", so, so->so_usecount);
1642 if (error) {
1643 socket_unlock(so, 1);
1644 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,0,0,0,0);
1645 return (error);
1646 }
1647 goto restart;
1648 }
1649 dontblock:
1650 #ifndef __APPLE__
1651 if (uio->uio_procp)
1652 uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
1653 #else /* __APPLE__ */
1654 /*
1655 * 2207985
 1656 * This should be uio->uio_procp; however, some callers of this
1657 * function use auto variables with stack garbage, and fail to
1658 * fill out the uio structure properly.
1659 */
1660 if (p)
1661 p->p_stats->p_ru.ru_msgrcv++;
1662 #endif /* __APPLE__ */
1663 nextrecord = m->m_nextpkt;
1664 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
1665 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
1666 orig_resid = 0;
1667 if (psa) {
1668 *psa = dup_sockaddr(mtod(m, struct sockaddr *),
1669 mp0 == 0);
1670 if ((*psa == 0) && (flags & MSG_NEEDSA)) {
1671 error = EWOULDBLOCK;
1672 goto release;
1673 }
1674 }
1675 if (flags & MSG_PEEK) {
1676 m = m->m_next;
1677 } else {
1678 sbfree(&so->so_rcv, m);
1679 if (m->m_next == 0 && so->so_rcv.sb_cc != 0)
1680 panic("soreceive: about to create invalid socketbuf");
1681 MFREE(m, so->so_rcv.sb_mb);
1682 m = so->so_rcv.sb_mb;
1683 }
1684 }
1685 while (m && m->m_type == MT_CONTROL && error == 0) {
1686 if (flags & MSG_PEEK) {
1687 if (controlp)
1688 *controlp = m_copy(m, 0, m->m_len);
1689 m = m->m_next;
1690 } else {
1691 sbfree(&so->so_rcv, m);
1692 if (controlp) {
1693 if (pr->pr_domain->dom_externalize &&
1694 mtod(m, struct cmsghdr *)->cmsg_type ==
1695 SCM_RIGHTS) {
1696 socket_unlock(so, 0); /* release socket lock: see 3903171 */
1697 error = (*pr->pr_domain->dom_externalize)(m);
1698 socket_lock(so, 0);
1699 }
1700 *controlp = m;
1701 if (m->m_next == 0 && so->so_rcv.sb_cc != 0)
1702 panic("soreceive: so->so_rcv.sb_mb->m_next == 0 && so->so_rcv.sb_cc != 0");
1703 so->so_rcv.sb_mb = m->m_next;
1704 m->m_next = 0;
1705 m = so->so_rcv.sb_mb;
1706 } else {
1707 MFREE(m, so->so_rcv.sb_mb);
1708 m = so->so_rcv.sb_mb;
1709 }
1710 }
1711 if (controlp) {
1712 orig_resid = 0;
1713 controlp = &(*controlp)->m_next;
1714 }
1715 }
1716 if (m) {
1717 if ((flags & MSG_PEEK) == 0)
1718 m->m_nextpkt = nextrecord;
1719 type = m->m_type;
1720 if (type == MT_OOBDATA)
1721 flags |= MSG_OOB;
1722 }
1723 moff = 0;
1724 offset = 0;
1725
1726 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
1727 can_delay = 1;
1728 else
1729 can_delay = 0;
1730
1731 need_event = 0;
1732
1733 while (m && (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
1734 if (m->m_type == MT_OOBDATA) {
1735 if (type != MT_OOBDATA)
1736 break;
1737 } else if (type == MT_OOBDATA)
1738 break;
1739 #ifndef __APPLE__
1740 /*
 1741 * This assertion needs rework. The trouble is AppleTalk uses many
1742 * mbuf types (NOT listed in mbuf.h!) which will trigger this panic.
1743 * For now just remove the assertion... CSM 9/98
1744 */
1745 else
1746 KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
1747 ("receive 3"));
1748 #else
1749 /*
 1750 * Make sure to always set the MSG_OOB event when getting
1751 * out of band data inline.
1752 */
1753 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
1754 (so->so_options & SO_OOBINLINE) != 0 &&
1755 (so->so_state & SS_RCVATMARK) != 0) {
1756 flags |= MSG_OOB;
1757 }
1758 #endif
1759 so->so_state &= ~SS_RCVATMARK;
1760 // LP64todo - fix this!
1761 len = uio_resid(uio) - delayed_copy_len;
1762 if (so->so_oobmark && len > so->so_oobmark - offset)
1763 len = so->so_oobmark - offset;
1764 if (len > m->m_len - moff)
1765 len = m->m_len - moff;
1766 /*
1767 * If mp is set, just pass back the mbufs.
1768 * Otherwise copy them out via the uio, then free.
1769 * Sockbuf must be consistent here (points to current mbuf,
1770 * it points to next record) when we drop priority;
1771 * we must note any additions to the sockbuf when we
1772 * block interrupts again.
1773 */
1774 if (mp == 0) {
1775 if (can_delay && len == m->m_len) {
1776 /*
1777 * only delay the copy if we're consuming the
1778 * mbuf and we're NOT in MSG_PEEK mode
 1779 * and we have enough data to make it worthwhile
 1780 * to drop and retake the funnel... can_delay
 1781 * reflects the state of the latter two constraints;
1782 * moff should always be zero in these cases
1783 */
1784 delayed_copy_len += len;
1785 } else {
1786
1787 if (delayed_copy_len) {
1788 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
1789
1790 if (error) {
1791 goto release;
1792 }
1793 if (m != so->so_rcv.sb_mb) {
1794 /*
1795 * can only get here if MSG_PEEK is not set
1796 * therefore, m should point at the head of the rcv queue...
1797 * if it doesn't, it means something drastically changed
1798 * while we were out from behind the funnel in sodelayed_copy...
1799 * perhaps a RST on the stream... in any event, the stream has
1800 * been interrupted... it's probably best just to return
1801 * whatever data we've moved and let the caller sort it out...
1802 */
1803 break;
1804 }
1805 }
1806 socket_unlock(so, 0);
1807 error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
1808 socket_lock(so, 0);
1809
1810 if (error)
1811 goto release;
1812 }
1813 } else
1814 uio_setresid(uio, (uio_resid(uio) - len));
1815
1816 if (len == m->m_len - moff) {
1817 if (m->m_flags & M_EOR)
1818 flags |= MSG_EOR;
1819 if (flags & MSG_PEEK) {
1820 m = m->m_next;
1821 moff = 0;
1822 } else {
1823 nextrecord = m->m_nextpkt;
1824 sbfree(&so->so_rcv, m);
1825 m->m_nextpkt = NULL;
1826
1827 if (mp) {
1828 *mp = m;
1829 mp = &m->m_next;
1830 so->so_rcv.sb_mb = m = m->m_next;
1831 *mp = (struct mbuf *)0;
1832 } else {
1833 if (free_list == NULL)
1834 free_list = m;
1835 else
1836 ml->m_next = m;
1837 ml = m;
1838 so->so_rcv.sb_mb = m = m->m_next;
1839 ml->m_next = 0;
1840 }
1841 if (m)
1842 m->m_nextpkt = nextrecord;
1843 }
1844 } else {
1845 if (flags & MSG_PEEK)
1846 moff += len;
1847 else {
1848 if (mp)
1849 *mp = m_copym(m, 0, len, M_WAIT);
1850 m->m_data += len;
1851 m->m_len -= len;
1852 so->so_rcv.sb_cc -= len;
1853 }
1854 }
1855 if (so->so_oobmark) {
1856 if ((flags & MSG_PEEK) == 0) {
1857 so->so_oobmark -= len;
1858 if (so->so_oobmark == 0) {
1859 so->so_state |= SS_RCVATMARK;
1860 /*
1861 * delay posting the actual event until after
1862 * any delayed copy processing has finished
1863 */
1864 need_event = 1;
1865 break;
1866 }
1867 } else {
1868 offset += len;
1869 if (offset == so->so_oobmark)
1870 break;
1871 }
1872 }
1873 if (flags & MSG_EOR)
1874 break;
1875 /*
1876 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set (for non-atomic socket),
1877 * we must not quit until "uio->uio_resid == 0" or an error
1878 * termination. If a signal/timeout occurs, return
1879 * with a short count but without error.
1880 * Keep sockbuf locked against other readers.
1881 */
1882 while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == 0 && (uio_resid(uio) - delayed_copy_len) > 0 &&
1883 !sosendallatonce(so) && !nextrecord) {
1884 if (so->so_error || so->so_state & SS_CANTRCVMORE)
1885 goto release;
1886
1887 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb && (((struct inpcb *)so->so_pcb)->inp_state != INPCB_STATE_DEAD))
1888 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1889 if (sbwait(&so->so_rcv)) {
1890 error = 0;
1891 goto release;
1892 }
1893 /*
1894 * have to wait until after we get back from the sbwait to do the copy because
1895 * we will drop the funnel if we have enough data that has been delayed... by dropping
1896 * the funnel we open up a window allowing the netisr thread to process the incoming packets
1897 * and to change the state of this socket... we're issuing the sbwait because
1898 * the socket is empty and we're expecting the netisr thread to wake us up when more
1899 * packets arrive... if we allow that processing to happen and then sbwait, we
1900 * could stall forever with packets sitting in the socket if no further packets
1901 * arrive from the remote side.
1902 *
1903 * we want to copy before we've collected all the data to satisfy this request to
1904 * allow the copy to overlap the incoming packet processing on an MP system
1905 */
1906 if (delayed_copy_len > sorecvmincopy && (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
1907
1908 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
1909
1910 if (error)
1911 goto release;
1912 }
1913 m = so->so_rcv.sb_mb;
1914 if (m) {
1915 nextrecord = m->m_nextpkt;
1916 }
1917 }
1918 }
1919 #ifdef MORE_LOCKING_DEBUG
1920 if (so->so_usecount <= 1)
1921 panic("soreceive: after big while so=%x ref=%d on socket\n", so, so->so_usecount);
1922 #endif
1923
1924 if (m && pr->pr_flags & PR_ATOMIC) {
1925 #ifdef __APPLE__
1926 if (so->so_options & SO_DONTTRUNC)
1927 flags |= MSG_RCVMORE;
1928 else {
1929 #endif
1930 flags |= MSG_TRUNC;
1931 if ((flags & MSG_PEEK) == 0)
1932 (void) sbdroprecord(&so->so_rcv);
1933 #ifdef __APPLE__
1934 }
1935 #endif
1936 }
1937 if ((flags & MSG_PEEK) == 0) {
1938 if (m == 0)
1939 so->so_rcv.sb_mb = nextrecord;
1940 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
1941 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1942 }
1943 #ifdef __APPLE__
1944 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
1945 flags |= MSG_HAVEMORE;
1946
1947 if (delayed_copy_len) {
1948 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
1949
1950 if (error)
1951 goto release;
1952 }
1953 if (free_list) {
1954 m_freem_list((struct mbuf *)free_list);
1955 free_list = (struct mbuf *)0;
1956 }
1957 if (need_event)
1958 postevent(so, 0, EV_OOB);
1959 #endif
1960 if (orig_resid == uio_resid(uio) && orig_resid &&
1961 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
1962 sbunlock(&so->so_rcv, 1);
1963 goto restart;
1964 }
1965
1966 if (flagsp)
1967 *flagsp |= flags;
1968 release:
1969 #ifdef MORE_LOCKING_DEBUG
1970 if (so->so_usecount <= 1)
1971 panic("soreceive: release so=%x ref=%d on socket\n", so, so->so_usecount);
1972 #endif
1973 if (delayed_copy_len) {
1974 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
1975 }
1976 if (free_list) {
1977 m_freem_list((struct mbuf *)free_list);
1978 }
1979 sbunlock(&so->so_rcv, 0); /* will unlock socket */
1980
1981 // LP64todo - fix this!
1982 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
1983 so,
1984 uio_resid(uio),
1985 so->so_rcv.sb_cc,
1986 0,
1987 error);
1988
1989 return (error);
1990 }
1991
1992
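/*
 * Flush mbufs whose copy to user space was deferred by soreceive():
 * drop the socket lock, uiomove each mbuf's data, free the chain, and
 * reset the caller's deferred-byte count before relocking the socket.
 */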
1993 static int sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list, int *resid)
1994 {
1995 int error = 0;
1996 struct mbuf *m;
1997
1998 m = *free_list;
1999
2000 socket_unlock(so, 0);
2001
2002 while (m && error == 0) {
2003
2004 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
2005
2006 m = m->m_next;
2007 }
2008 m_freem_list(*free_list);
2009
2010 *free_list = (struct mbuf *)NULL;
2011 *resid = 0;
2012
2013 socket_lock(so, 0);
2014
2015 return (error);
2016 }
2017
2018
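/*
 * Shut down one or both halves of a connection: notify the socket
 * filters, flush the receive side unless `how' is SHUT_WR, and call the
 * protocol's pru_shutdown unless `how' is SHUT_RD.
 */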
2019 int
2020 soshutdown(so, how)
2021 register struct socket *so;
2022 register int how;
2023 {
2024 register struct protosw *pr = so->so_proto;
2025 int ret;
2026
2027 socket_lock(so, 1);
2028
2029 sflt_notify(so, sock_evt_shutdown, &how);
2030
2031 if (how != SHUT_WR) {
2032 sorflush(so);
2033 postevent(so, 0, EV_RCLOSED);
2034 }
2035 if (how != SHUT_RD) {
2036 ret = ((*pr->pr_usrreqs->pru_shutdown)(so));
2037 postevent(so, 0, EV_WCLOSED);
2038 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, 0,0,0,0,0);
2039 socket_unlock(so, 1);
2040 return(ret);
2041 }
2042
2043 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, 0,0,0,0,0);
2044 socket_unlock(so, 1);
2045 return (0);
2046 }
2047
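/*
 * Flush the receive buffer: mark the socket unable to receive more,
 * clear any selecting threads, preserve knote state, let the domain
 * dispose of any in-flight rights (such as passed file descriptors),
 * and release the queued mbufs.
 */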
2048 void
2049 sorflush(so)
2050 register struct socket *so;
2051 {
2052 register struct sockbuf *sb = &so->so_rcv;
2053 register struct protosw *pr = so->so_proto;
2054 struct sockbuf asb;
2055
2056 #ifdef MORE_LOCKING_DEBUG
2057 lck_mtx_t * mutex_held;
2058
2059 if (so->so_proto->pr_getlock != NULL)
2060 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
2061 else
2062 mutex_held = so->so_proto->pr_domain->dom_mtx;
2063 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
2064 #endif
2065
2066 sflt_notify(so, sock_evt_flush_read, NULL);
2067
2068 sb->sb_flags |= SB_NOINTR;
2069 (void) sblock(sb, M_WAIT);
2070 socantrcvmore(so);
2071 sbunlock(sb, 1);
2072 #ifdef __APPLE__
2073 selthreadclear(&sb->sb_sel);
2074 #endif
2075 asb = *sb;
2076 bzero((caddr_t)sb, sizeof (*sb));
2077 sb->sb_so = so; /* reestablish link to socket */
2078 if (asb.sb_flags & SB_KNOTE) {
2079 sb->sb_sel.si_note = asb.sb_sel.si_note;
2080 sb->sb_flags = SB_KNOTE;
2081 }
2082 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
2083 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
2084 sbrelease(&asb);
2085 }
2086
2087 /*
2088 * Perhaps this routine, and sooptcopyout(), below, ought to come in
2089 * an additional variant to handle the case where the option value needs
2090 * to be some kind of integer, but not a specific size.
2091 * In addition to their use here, these functions are also called by the
2092 * protocol-level pr_ctloutput() routines.
2093 */
2094 int
2095 sooptcopyin(sopt, buf, len, minlen)
2096 struct sockopt *sopt;
2097 void *buf;
2098 size_t len;
2099 size_t minlen;
2100 {
2101 size_t valsize;
2102
2103 /*
2104 * If the user gives us more than we wanted, we ignore it,
2105 * but if we don't get the minimum length the caller
2106 * wants, we return EINVAL. On success, sopt->sopt_valsize
2107 * is set to however much we actually retrieved.
2108 */
2109 if ((valsize = sopt->sopt_valsize) < minlen)
2110 return EINVAL;
2111 if (valsize > len)
2112 sopt->sopt_valsize = valsize = len;
2113
2114 if (sopt->sopt_p != 0)
2115 return (copyin(sopt->sopt_val, buf, valsize));
2116
2117 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
2118 return 0;
2119 }
2120
2121 int
2122 sosetopt(so, sopt)
2123 struct socket *so;
2124 struct sockopt *sopt;
2125 {
2126 int error, optval;
2127 struct linger l;
2128 struct timeval tv;
2129 short val;
2130
2131 socket_lock(so, 1);
2132
2133 if (sopt->sopt_dir != SOPT_SET) {
2134 sopt->sopt_dir = SOPT_SET;
2135 }
2136
2137 {
2138 struct socket_filter_entry *filter;
2139 int filtered = 0;
2140 error = 0;
2141 for (filter = so->so_filt; filter && (error == 0);
2142 filter = filter->sfe_next_onsocket) {
2143 if (filter->sfe_filter->sf_filter.sf_setoption) {
2144 if (filtered == 0) {
2145 filtered = 1;
2146 sflt_use(so);
2147 socket_unlock(so, 0);
2148 }
2149 error = filter->sfe_filter->sf_filter.sf_setoption(
2150 filter->sfe_cookie, so, sopt);
2151 }
2152 }
2153
2154 if (filtered != 0) {
2155 socket_lock(so, 0);
2156 sflt_unuse(so);
2157
2158 if (error) {
2159 if (error == EJUSTRETURN)
2160 error = 0;
2161 goto bad;
2162 }
2163 }
2164 }
2165
2166 error = 0;
2167 if (sopt->sopt_level != SOL_SOCKET) {
2168 if (so->so_proto && so->so_proto->pr_ctloutput) {
2169 error = (*so->so_proto->pr_ctloutput)
2170 (so, sopt);
2171 socket_unlock(so, 1);
2172 return (error);
2173 }
2174 error = ENOPROTOOPT;
2175 } else {
2176 switch (sopt->sopt_name) {
2177 case SO_LINGER:
2178 case SO_LINGER_SEC:
2179 error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2180 if (error)
2181 goto bad;
2182
2183 so->so_linger = (sopt->sopt_name == SO_LINGER) ? l.l_linger : l.l_linger * hz;
2184 if (l.l_onoff)
2185 so->so_options |= SO_LINGER;
2186 else
2187 so->so_options &= ~SO_LINGER;
2188 break;
2189
2190 case SO_DEBUG:
2191 case SO_KEEPALIVE:
2192 case SO_DONTROUTE:
2193 case SO_USELOOPBACK:
2194 case SO_BROADCAST:
2195 case SO_REUSEADDR:
2196 case SO_REUSEPORT:
2197 case SO_OOBINLINE:
2198 case SO_TIMESTAMP:
2199 #ifdef __APPLE__
2200 case SO_DONTTRUNC:
2201 case SO_WANTMORE:
2202 case SO_WANTOOBFLAG:
2203 #endif
2204 error = sooptcopyin(sopt, &optval, sizeof optval,
2205 sizeof optval);
2206 if (error)
2207 goto bad;
2208 if (optval)
2209 so->so_options |= sopt->sopt_name;
2210 else
2211 so->so_options &= ~sopt->sopt_name;
2212 break;
2213
2214 case SO_SNDBUF:
2215 case SO_RCVBUF:
2216 case SO_SNDLOWAT:
2217 case SO_RCVLOWAT:
2218 error = sooptcopyin(sopt, &optval, sizeof optval,
2219 sizeof optval);
2220 if (error)
2221 goto bad;
2222
2223 /*
2224 * Values < 1 make no sense for any of these
2225 * options, so disallow them.
2226 */
2227 if (optval < 1) {
2228 error = EINVAL;
2229 goto bad;
2230 }
2231
2232 switch (sopt->sopt_name) {
2233 case SO_SNDBUF:
2234 case SO_RCVBUF:
2235 if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
2236 &so->so_snd : &so->so_rcv,
2237 (u_long) optval) == 0) {
2238 error = ENOBUFS;
2239 goto bad;
2240 }
2241 break;
2242
2243 /*
2244 * Make sure the low-water is never greater than
2245 * the high-water.
2246 */
2247 case SO_SNDLOWAT:
2248 so->so_snd.sb_lowat =
2249 (optval > so->so_snd.sb_hiwat) ?
2250 so->so_snd.sb_hiwat : optval;
2251 break;
2252 case SO_RCVLOWAT:
2253 so->so_rcv.sb_lowat =
2254 (optval > so->so_rcv.sb_hiwat) ?
2255 so->so_rcv.sb_hiwat : optval;
2256 break;
2257 }
2258 break;
2259
2260 case SO_SNDTIMEO:
2261 case SO_RCVTIMEO:
2262 error = sooptcopyin(sopt, &tv, sizeof tv,
2263 sizeof tv);
2264 if (error)
2265 goto bad;
2266
2267 if (tv.tv_sec < 0 || tv.tv_sec > LONG_MAX ||
2268 tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
2269 error = EDOM;
2270 goto bad;
2271 }
2272
2273 switch (sopt->sopt_name) {
2274 case SO_SNDTIMEO:
2275 so->so_snd.sb_timeo = tv;
2276 break;
2277 case SO_RCVTIMEO:
2278 so->so_rcv.sb_timeo = tv;
2279 break;
2280 }
2281 break;
2282
2283 case SO_NKE:
2284 {
2285 struct so_nke nke;
2286
2287 error = sooptcopyin(sopt, &nke,
2288 sizeof nke, sizeof nke);
2289 if (error)
2290 goto bad;
2291
2292 error = sflt_attach_private(so, NULL, nke.nke_handle, 1);
2293 break;
2294 }
2295
2296 case SO_NOSIGPIPE:
2297 error = sooptcopyin(sopt, &optval, sizeof optval,
2298 sizeof optval);
2299 if (error)
2300 goto bad;
2301 if (optval)
2302 so->so_flags |= SOF_NOSIGPIPE;
2303 else
2304 so->so_flags &= ~SOF_NOSIGPIPE;
2305
2306 break;
2307
2308 case SO_NOADDRERR:
2309 error = sooptcopyin(sopt, &optval, sizeof optval,
2310 sizeof optval);
2311 if (error)
2312 goto bad;
2313 if (optval)
2314 so->so_flags |= SOF_NOADDRAVAIL;
2315 else
2316 so->so_flags &= ~SOF_NOADDRAVAIL;
2317
2318 break;
2319
2320 default:
2321 error = ENOPROTOOPT;
2322 break;
2323 }
2324 if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
2325 (void) ((*so->so_proto->pr_ctloutput)
2326 (so, sopt));
2327 }
2328 }
2329 bad:
2330 socket_unlock(so, 1);
2331 return (error);
2332 }
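
/*
 * Illustrative userland sketch (not part of this file) exercising options
 * handled by sosetopt() above. The buffer size, low-water mark, timeout and
 * descriptor 'fd' are arbitrary example values; SO_LINGER_SEC is the
 * Apple-specific variant whose interval is given in seconds (note the
 * '* hz' conversion above).
 */
#include <sys/socket.h>
#include <sys/time.h>

static int
tune_receive_side(int fd)
{
	int rcvbuf = 64 * 1024;		/* SO_RCVBUF: rejected with EINVAL if < 1 */
	int lowat = 512;		/* SO_RCVLOWAT: clamped to sb_hiwat above */
	struct timeval tv = { 5, 0 };	/* SO_RCVTIMEO: tv_usec must be < 1000000 */
	struct linger l = { 1, 2 };	/* SO_LINGER_SEC: linger up to 2 seconds */

	if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof (rcvbuf)) < 0)
		return (-1);
	if (setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, &lowat, sizeof (lowat)) < 0)
		return (-1);
	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof (tv)) < 0)
		return (-1);
	return setsockopt(fd, SOL_SOCKET, SO_LINGER_SEC, &l, sizeof (l));
}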
2333
2334 /* Helper routine for getsockopt */
2335 int
2336 sooptcopyout(sopt, buf, len)
2337 struct sockopt *sopt;
2338 void *buf;
2339 size_t len;
2340 {
2341 int error;
2342 size_t valsize;
2343
2344 error = 0;
2345
2346 /*
2347 * Documented get behavior is that we always return a value,
2348 * possibly truncated to fit in the user's buffer.
2349 * Traditional behavior is that we always tell the user
2350 * precisely how much we copied, rather than something useful
2351 * like the total amount we had available for her.
2352 * Note that this interface is not idempotent; the entire answer must
2353 * be generated ahead of time.
2354 */
2355 valsize = min(len, sopt->sopt_valsize);
2356 sopt->sopt_valsize = valsize;
2357 if (sopt->sopt_val != USER_ADDR_NULL) {
2358 if (sopt->sopt_p != 0)
2359 error = copyout(buf, sopt->sopt_val, valsize);
2360 else
2361 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
2362 }
2363 return error;
2364 }
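
/*
 * Hedged sketch (not from this file) of how a protocol-level pr_ctloutput()
 * routine -- the other caller mentioned in the comment above sooptcopyin()
 * -- might use sooptcopyin()/sooptcopyout() for an integer-valued option.
 * The handler name and the control-block accesses are hypothetical; only
 * the two helper calls follow the code above.
 */
static int
myproto_ctloutput(__unused struct socket *so, struct sockopt *sopt)
{
	int error = 0, optval = 0;

	switch (sopt->sopt_dir) {
	case SOPT_SET:
		/* EINVAL if shorter than an int; longer values are truncated. */
		error = sooptcopyin(sopt, &optval, sizeof (optval), sizeof (optval));
		if (error != 0)
			break;
		/* store optval in the protocol control block here */
		break;
	case SOPT_GET:
		/* fetch the current value from the protocol control block here */
		error = sooptcopyout(sopt, &optval, sizeof (optval));
		break;
	default:
		error = EINVAL;
		break;
	}
	return (error);
}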
2365
2366 int
2367 sogetopt(so, sopt)
2368 struct socket *so;
2369 struct sockopt *sopt;
2370 {
2371 int error, optval;
2372 struct linger l;
2373 struct timeval tv;
2374
2375 if (sopt->sopt_dir != SOPT_GET) {
2376 sopt->sopt_dir = SOPT_GET;
2377 }
2378
2379 socket_lock(so, 1);
2380
2381 {
2382 struct socket_filter_entry *filter;
2383 int filtered = 0;
2384 error = 0;
2385 for (filter = so->so_filt; filter && (error == 0);
2386 filter = filter->sfe_next_onsocket) {
2387 if (filter->sfe_filter->sf_filter.sf_getoption) {
2388 if (filtered == 0) {
2389 filtered = 1;
2390 sflt_use(so);
2391 socket_unlock(so, 0);
2392 }
2393 error = filter->sfe_filter->sf_filter.sf_getoption(
2394 filter->sfe_cookie, so, sopt);
2395 }
2396 }
2397 if (filtered != 0) {
2398 socket_lock(so, 0);
2399 sflt_unuse(so);
2400
2401 if (error) {
2402 if (error == EJUSTRETURN)
2403 error = 0;
2404 socket_unlock(so, 1);
2405 return error;
2406 }
2407 }
2408 }
2409
2410 error = 0;
2411 if (sopt->sopt_level != SOL_SOCKET) {
2412 if (so->so_proto && so->so_proto->pr_ctloutput) {
2413 error = (*so->so_proto->pr_ctloutput)
2414 (so, sopt);
2415 socket_unlock(so, 1);
2416 return (error);
2417 } else {
2418 socket_unlock(so, 1);
2419 return (ENOPROTOOPT);
2420 }
2421 } else {
2422 switch (sopt->sopt_name) {
2423 case SO_LINGER:
2424 case SO_LINGER_SEC:
2425 l.l_onoff = so->so_options & SO_LINGER;
2426 l.l_linger = (sopt->sopt_name == SO_LINGER) ? so->so_linger :
2427 so->so_linger / hz;
2428 error = sooptcopyout(sopt, &l, sizeof l);
2429 break;
2430
2431 case SO_USELOOPBACK:
2432 case SO_DONTROUTE:
2433 case SO_DEBUG:
2434 case SO_KEEPALIVE:
2435 case SO_REUSEADDR:
2436 case SO_REUSEPORT:
2437 case SO_BROADCAST:
2438 case SO_OOBINLINE:
2439 case SO_TIMESTAMP:
2440 #ifdef __APPLE__
2441 case SO_DONTTRUNC:
2442 case SO_WANTMORE:
2443 case SO_WANTOOBFLAG:
2444 #endif
2445 optval = so->so_options & sopt->sopt_name;
2446 integer:
2447 error = sooptcopyout(sopt, &optval, sizeof optval);
2448 break;
2449
2450 case SO_TYPE:
2451 optval = so->so_type;
2452 goto integer;
2453
2454 #ifdef __APPLE__
2455 case SO_NREAD:
2456 {
2457 int pkt_total;
2458 struct mbuf *m1;
2459
2460 pkt_total = 0;
2461 m1 = so->so_rcv.sb_mb;
2462 if (so->so_proto->pr_flags & PR_ATOMIC)
2463 {
2464 while (m1) {
2465 if (m1->m_type == MT_DATA)
2466 pkt_total += m1->m_len;
2467 m1 = m1->m_next;
2468 }
2469 optval = pkt_total;
2470 } else
2471 optval = so->so_rcv.sb_cc;
2472 goto integer;
2473 }
2474 case SO_NWRITE:
2475 optval = so->so_snd.sb_cc;
2476 goto integer;
2477 #endif
2478 case SO_ERROR:
2479 optval = so->so_error;
2480 so->so_error = 0;
2481 goto integer;
2482
2483 case SO_SNDBUF:
2484 optval = so->so_snd.sb_hiwat;
2485 goto integer;
2486
2487 case SO_RCVBUF:
2488 optval = so->so_rcv.sb_hiwat;
2489 goto integer;
2490
2491 case SO_SNDLOWAT:
2492 optval = so->so_snd.sb_lowat;
2493 goto integer;
2494
2495 case SO_RCVLOWAT:
2496 optval = so->so_rcv.sb_lowat;
2497 goto integer;
2498
2499 case SO_SNDTIMEO:
2500 case SO_RCVTIMEO:
2501 tv = (sopt->sopt_name == SO_SNDTIMEO ?
2502 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2503
2504 error = sooptcopyout(sopt, &tv, sizeof tv);
2505 break;
2506
2507 case SO_NOSIGPIPE:
2508 optval = (so->so_flags & SOF_NOSIGPIPE);
2509 goto integer;
2510
2511 case SO_NOADDRERR:
2512 optval = (so->so_flags & SOF_NOADDRAVAIL);
2513 goto integer;
2514
2515 default:
2516 error = ENOPROTOOPT;
2517 break;
2518 }
2519 socket_unlock(so, 1);
2520 return (error);
2521 }
2522 }
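
/*
 * Illustrative userland counterpart (not part of this file) to the
 * Apple-specific SO_NREAD branch above: on a datagram (PR_ATOMIC) socket it
 * reports the data bytes of the first pending record, on a stream socket the
 * total bytes buffered. 'fd' is assumed to be an open socket.
 */
#include <sys/socket.h>

/* Number of bytes the next read on 'fd' could consume, or -1 on error. */
static int
pending_bytes(int fd)
{
	int nread = 0;
	socklen_t len = sizeof (nread);

	if (getsockopt(fd, SOL_SOCKET, SO_NREAD, &nread, &len) < 0)
		return (-1);
	return (nread);
}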
2523
2524 /* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
2525 int
2526 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2527 {
2528 struct mbuf *m, *m_prev;
2529 int sopt_size = sopt->sopt_valsize;
2530
2531 if (sopt_size > MAX_SOOPTGETM_SIZE)
2532 return EMSGSIZE;
2533
2534 MGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT, MT_DATA);
2535 if (m == 0)
2536 return ENOBUFS;
2537 if (sopt_size > MLEN) {
2538 MCLGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT);
2539 if ((m->m_flags & M_EXT) == 0) {
2540 m_free(m);
2541 return ENOBUFS;
2542 }
2543 m->m_len = min(MCLBYTES, sopt_size);
2544 } else {
2545 m->m_len = min(MLEN, sopt_size);
2546 }
2547 sopt_size -= m->m_len;
2548 *mp = m;
2549 m_prev = m;
2550
2551 while (sopt_size) {
2552 MGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT, MT_DATA);
2553 if (m == 0) {
2554 m_freem(*mp);
2555 return ENOBUFS;
2556 }
2557 if (sopt_size > MLEN) {
2558 MCLGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT);
2559 if ((m->m_flags & M_EXT) == 0) {
2560 m_freem(*mp);
2561 return ENOBUFS;
2562 }
2563 m->m_len = min(MCLBYTES, sopt_size);
2564 } else {
2565 m->m_len = min(MLEN, sopt_size);
2566 }
2567 sopt_size -= m->m_len;
2568 m_prev->m_next = m;
2569 m_prev = m;
2570 }
2571 return 0;
2572 }
2573
2574 /* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
2575 int
2576 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2577 {
2578 struct mbuf *m0 = m;
2579
2580 if (sopt->sopt_val == USER_ADDR_NULL)
2581 return 0;
2582 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2583 if (sopt->sopt_p != NULL) {
2584 int error;
2585
2586 error = copyin(sopt->sopt_val, mtod(m, char *), m->m_len);
2587 if (error != 0) {
2588 m_freem(m0);
2589 return(error);
2590 }
2591 } else
2592 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), mtod(m, char *), m->m_len);
2593 sopt->sopt_valsize -= m->m_len;
2594 sopt->sopt_val += m->m_len;
2595 m = m->m_next;
2596 }
2597 if (m != NULL) /* ip6_sooptmcopyin() should have allocated enough space */
2598 panic("soopt_mcopyin");
2599 return 0;
2600 }
2601
2602 /* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2603 int
2604 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2605 {
2606 struct mbuf *m0 = m;
2607 size_t valsize = 0;
2608
2609 if (sopt->sopt_val == USER_ADDR_NULL)
2610 return 0;
2611 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2612 if (sopt->sopt_p != NULL) {
2613 int error;
2614
2615 error = copyout(mtod(m, char *), sopt->sopt_val, m->m_len);
2616 if (error != 0) {
2617 m_freem(m0);
2618 return(error);
2619 }
2620 } else
2621 bcopy(mtod(m, char *), CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
2622 sopt->sopt_valsize -= m->m_len;
2623 sopt->sopt_val += m->m_len;
2624 valsize += m->m_len;
2625 m = m->m_next;
2626 }
2627 if (m != NULL) {
2628 /* the user should have supplied a large enough option buffer */
2629 m_freem(m0);
2630 return(EINVAL);
2631 }
2632 sopt->sopt_valsize = valsize;
2633 return 0;
2634 }
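
/*
 * Hedged sketch (not from this file) of the calling sequence the three
 * compatibility helpers above appear intended for: soopt_getm() sizes an
 * mbuf chain from sopt_valsize, soopt_mcopyin() fills it from the user's
 * buffer for a SET, and soopt_mcopyout() copies a chain back for a GET.
 * The handler name and the processing placeholders are hypothetical.
 */
static int
myproto_ctloutput_mbuf(__unused struct socket *so, struct sockopt *sopt)
{
	struct mbuf *m = NULL;
	int error;

	error = soopt_getm(sopt, &m);		/* allocate the chain */
	if (error != 0)
		return (error);

	if (sopt->sopt_dir == SOPT_SET) {
		error = soopt_mcopyin(sopt, m);	/* frees the chain itself on failure */
		if (error != 0)
			return (error);
		/* ... legacy mbuf-based SET processing would go here ... */
		m_freem(m);
		return (0);
	}

	/* SOPT_GET: fill the chain with the current value here, then copy out. */
	error = soopt_mcopyout(sopt, m);	/* frees the chain only on failure */
	if (error == 0)
		m_freem(m);
	return (error);
}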
2635
2636 void
2637 sohasoutofband(so)
2638 register struct socket *so;
2639 {
2640 struct proc *p;
2641
2642 if (so->so_pgid < 0)
2643 gsignal(-so->so_pgid, SIGURG);
2644 else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
2645 psignal(p, SIGURG);
2646 selwakeup(&so->so_rcv.sb_sel);
2647 }
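
/*
 * Illustrative userland sketch (not part of this file) of receiving the
 * SIGURG raised by sohasoutofband(): the signal is only delivered once the
 * process has claimed the socket with F_SETOWN (which sets so_pgid), after
 * which the urgent byte can be pulled with MSG_OOB. Function names are
 * illustrative.
 */
#include <sys/socket.h>
#include <fcntl.h>
#include <signal.h>
#include <unistd.h>

static volatile sig_atomic_t got_urgent;

static void
on_sigurg(int sig)
{
	(void)sig;
	got_urgent = 1;
}

/* Arrange for SIGURG from sohasoutofband() to reach this process. */
static int
watch_oob(int fd)
{
	signal(SIGURG, on_sigurg);
	return fcntl(fd, F_SETOWN, getpid());
}

/* Typically called after got_urgent fires: fetch the out-of-band byte. */
static ssize_t
read_urgent_byte(int fd, char *byte)
{
	return recv(fd, byte, 1, MSG_OOB);
}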
2648
2649 int
2650 sopoll(struct socket *so, int events, __unused kauth_cred_t cred, void * wql)
2651 {
2652 struct proc *p = current_proc();
2653 int revents = 0;
2654
2655 socket_lock(so, 1);
2656
2657 if (events & (POLLIN | POLLRDNORM))
2658 if (soreadable(so))
2659 revents |= events & (POLLIN | POLLRDNORM);
2660
2661 if (events & (POLLOUT | POLLWRNORM))
2662 if (sowriteable(so))
2663 revents |= events & (POLLOUT | POLLWRNORM);
2664
2665 if (events & (POLLPRI | POLLRDBAND))
2666 if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
2667 revents |= events & (POLLPRI | POLLRDBAND);
2668
2669 if (revents == 0) {
2670 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
2671 /* Darwin sets the flag first, BSD calls selrecord first */
2672 so->so_rcv.sb_flags |= SB_SEL;
2673 selrecord(p, &so->so_rcv.sb_sel, wql);
2674 }
2675
2676 if (events & (POLLOUT | POLLWRNORM)) {
2677 /* Darwin sets the flag first, BSD calls selrecord first */
2678 so->so_snd.sb_flags |= SB_SEL;
2679 selrecord(p, &so->so_snd.sb_sel, wql);
2680 }
2681 }
2682
2683 socket_unlock(so, 1);
2684 return (revents);
2685 }
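
/*
 * Illustrative userland view (not part of this file) of the event classes
 * sopoll() checks above: POLLIN/POLLRDNORM when soreadable(), POLLOUT/
 * POLLWRNORM when sowriteable(), and POLLPRI/POLLRDBAND when an out-of-band
 * mark is pending. 'fd' and the timeout are example values.
 */
#include <poll.h>

/* Wait up to 'ms' milliseconds for normal data or an OOB mark on 'fd'. */
static int
wait_readable_or_oob(int fd, int ms)
{
	struct pollfd pfd;

	pfd.fd = fd;
	pfd.events = POLLIN | POLLPRI;	/* soreadable() / so_oobmark paths above */
	pfd.revents = 0;

	return poll(&pfd, 1, ms);	/* > 0: ready, 0: timeout, -1: error */
}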
2686
2687 int soo_kqfilter(struct fileproc *fp, struct knote *kn, struct proc *p);
2688
2689 int
2690 soo_kqfilter(__unused struct fileproc *fp, struct knote *kn, __unused struct proc *p)
2691 {
2692 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
2693 struct sockbuf *sb;
2694 socket_lock(so, 1);
2695
2696 switch (kn->kn_filter) {
2697 case EVFILT_READ:
2698 if (so->so_options & SO_ACCEPTCONN)
2699 kn->kn_fop = &solisten_filtops;
2700 else
2701 kn->kn_fop = &soread_filtops;
2702 sb = &so->so_rcv;
2703 break;
2704 case EVFILT_WRITE:
2705 kn->kn_fop = &sowrite_filtops;
2706 sb = &so->so_snd;
2707 break;
2708 default:
2709 socket_unlock(so, 1);
2710 return (1);
2711 }
2712
2713 if (KNOTE_ATTACH(&sb->sb_sel.si_note, kn))
2714 sb->sb_flags |= SB_KNOTE;
2715 socket_unlock(so, 1);
2716 return (0);
2717 }
2718
2719 static void
2720 filt_sordetach(struct knote *kn)
2721 {
2722 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
2723
2724 socket_lock(so, 1);
2725 if (so->so_rcv.sb_flags & SB_KNOTE)
2726 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
2727 so->so_rcv.sb_flags &= ~SB_KNOTE;
2728 socket_unlock(so, 1);
2729 }
2730
2731 /*ARGSUSED*/
2732 static int
2733 filt_soread(struct knote *kn, long hint)
2734 {
2735 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
2736
2737 if ((hint & SO_FILT_HINT_LOCKED) == 0)
2738 socket_lock(so, 1);
2739
2740 if (so->so_oobmark) {
2741 if (kn->kn_flags & EV_OOBAND) {
2742 kn->kn_data = so->so_rcv.sb_cc - so->so_oobmark;
2743 if ((hint & SO_FILT_HINT_LOCKED) == 0)
2744 socket_unlock(so, 1);
2745 return (1);
2746 }
2747 kn->kn_data = so->so_oobmark;
2748 kn->kn_flags |= EV_OOBAND;
2749 } else {
2750 kn->kn_data = so->so_rcv.sb_cc;
2751 if (so->so_state & SS_CANTRCVMORE) {
2752 kn->kn_flags |= EV_EOF;
2753 kn->kn_fflags = so->so_error;
2754 if ((hint & SO_FILT_HINT_LOCKED) == 0)
2755 socket_unlock(so, 1);
2756 return (1);
2757 }
2758 }
2759
2760 if (so->so_state & SS_RCVATMARK) {
2761 if (kn->kn_flags & EV_OOBAND) {
2762 if ((hint & SO_FILT_HINT_LOCKED) == 0)
2763 socket_unlock(so, 1);
2764 return (1);
2765 }
2766 kn->kn_flags |= EV_OOBAND;
2767 } else if (kn->kn_flags & EV_OOBAND) {
2768 kn->kn_data = 0;
2769 if ((hint & SO_FILT_HINT_LOCKED) == 0)
2770 socket_unlock(so, 1);
2771 return (0);
2772 }
2773
2774 if (so->so_error) { /* temporary udp error */
2775 if ((hint & SO_FILT_HINT_LOCKED) == 0)
2776 socket_unlock(so, 1);
2777 return (1);
2778 }
2779
2780 if ((hint & SO_FILT_HINT_LOCKED) == 0)
2781 socket_unlock(so, 1);
2782
2783 return( kn->kn_flags & EV_OOBAND ||
2784 kn->kn_data >= ((kn->kn_sfflags & NOTE_LOWAT) ?
2785 kn->kn_sdata : so->so_rcv.sb_lowat));
2786 }
2787
2788 static void
2789 filt_sowdetach(struct knote *kn)
2790 {
2791 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
2792 socket_lock(so, 1);
2793
2794 if(so->so_snd.sb_flags & SB_KNOTE)
2795 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
2796 so->so_snd.sb_flags &= ~SB_KNOTE;
2797 socket_unlock(so, 1);
2798 }
2799
2800 /*ARGSUSED*/
2801 static int
2802 filt_sowrite(struct knote *kn, long hint)
2803 {
2804 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
2805
2806 if ((hint & SO_FILT_HINT_LOCKED) == 0)
2807 socket_lock(so, 1);
2808
2809 kn->kn_data = sbspace(&so->so_snd);
2810 if (so->so_state & SS_CANTSENDMORE) {
2811 kn->kn_flags |= EV_EOF;
2812 kn->kn_fflags = so->so_error;
2813 if ((hint & SO_FILT_HINT_LOCKED) == 0)
2814 socket_unlock(so, 1);
2815 return (1);
2816 }
2817 if (so->so_error) { /* temporary udp error */
2818 if ((hint & SO_FILT_HINT_LOCKED) == 0)
2819 socket_unlock(so, 1);
2820 return (1);
2821 }
2822 if (((so->so_state & SS_ISCONNECTED) == 0) &&
2823 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
2824 if ((hint & SO_FILT_HINT_LOCKED) == 0)
2825 socket_unlock(so, 1);
2826 return (0);
2827 }
2828 if ((hint & SO_FILT_HINT_LOCKED) == 0)
2829 socket_unlock(so, 1);
2830 if (kn->kn_sfflags & NOTE_LOWAT)
2831 return (kn->kn_data >= kn->kn_sdata);
2832 return (kn->kn_data >= so->so_snd.sb_lowat);
2833 }
2834
2835 /*ARGSUSED*/
2836 static int
2837 filt_solisten(struct knote *kn, long hint)
2838 {
2839 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
2840 int isempty;
2841
2842 if ((hint & SO_FILT_HINT_LOCKED) == 0)
2843 socket_lock(so, 1);
2844 kn->kn_data = so->so_qlen;
2845 isempty = ! TAILQ_EMPTY(&so->so_comp); /* despite the name: non-zero when completed connections are pending */
2846 if ((hint & SO_FILT_HINT_LOCKED) == 0)
2847 socket_unlock(so, 1);
2848 return (isempty);
2849 }
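
/*
 * Illustrative userland sketch (not part of this file) of the knote filters
 * above: EVFILT_READ on a listening socket is routed to solisten_filtops, so
 * kevent() reports the completed-connection count as the event's data, while
 * NOTE_LOWAT on a connected socket makes filt_soread() compare against
 * kn_sdata instead of sb_lowat. 'kq', 'lfd', 'cfd' and 'lowat' are assumed
 * to be set up by the caller.
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

static int
register_sockets(int kq, int lfd, int cfd, int lowat)
{
	struct kevent kev[2];

	/* Listening socket: fires when at least one connection is queued. */
	EV_SET(&kev[0], lfd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	/* Connected socket: fires only once 'lowat' bytes are buffered. */
	EV_SET(&kev[1], cfd, EVFILT_READ, EV_ADD, NOTE_LOWAT, lowat, NULL);

	return kevent(kq, kev, 2, NULL, 0, NULL);
}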
2850
2851
/*
 * Acquire the socket's lock -- the protocol's pr_lock if it provides one,
 * otherwise the domain mutex -- and, when 'refcount' is non-zero, take a
 * use-count reference; on PowerPC the caller's link register is recorded
 * for debugging.
 */
2852 int
2853 socket_lock(so, refcount)
2854 struct socket *so;
2855 int refcount;
2856 {
2857 int error = 0, lr, lr_saved;
2858 #ifdef __ppc__
2859 __asm__ volatile("mflr %0" : "=r" (lr));
2860 lr_saved = lr;
2861 #endif
2862
2863 if (so->so_proto->pr_lock) {
2864 error = (*so->so_proto->pr_lock)(so, refcount, lr_saved);
2865 }
2866 else {
2867 #ifdef MORE_LOCKING_DEBUG
2868 lck_mtx_assert(so->so_proto->pr_domain->dom_mtx, LCK_MTX_ASSERT_NOTOWNED);
2869 #endif
2870 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
2871 if (refcount)
2872 so->so_usecount++;
2873 so->reserved3 = (void*)lr_saved; /* save caller for refcount going to zero */
2874 }
2875
2876 return(error);
2877
2878 }
2879
/*
 * Release the socket's lock and, when 'refcount' is non-zero, drop a
 * use-count reference; dropping the final reference calls sofreelastref().
 */
2880 int
2881 socket_unlock(so, refcount)
2882 struct socket *so;
2883 int refcount;
2884 {
2885 int error = 0, lr, lr_saved;
2886 lck_mtx_t * mutex_held;
2887
2888 #ifdef __ppc__
2889 __asm__ volatile("mflr %0" : "=r" (lr));
2890 lr_saved = lr;
2891 #endif
2892
2893
2894
2895 if (so->so_proto == NULL)
2896 panic("socket_unlock null so_proto so=%x\n", so);
2897
2898 if (so && so->so_proto->pr_unlock)
2899 error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
2900 else {
2901 mutex_held = so->so_proto->pr_domain->dom_mtx;
2902 #ifdef MORE_LOCKING_DEBUG
2903 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
2904 #endif
2905 if (refcount) {
2906 if (so->so_usecount <= 0)
2907 panic("socket_unlock: bad refcount so=%x value=%d\n", so, so->so_usecount);
2908 so->so_usecount--;
2909 if (so->so_usecount == 0) {
2910 sofreelastref(so, 1);
2911 }
2912 else
2913 so->reserved4 = (void*)lr_saved; /* save caller */
2914 }
2915 lck_mtx_unlock(mutex_held);
2916 }
2917
2918 return(error);
2919 }
2920 //### Called with socket locked, will unlock socket
2921 void
2922 sofree(so)
2923 struct socket *so;
2924 {
2925
2926 int lr, lr_saved;
2927 lck_mtx_t * mutex_held;
2928 #ifdef __ppc__
2929 __asm__ volatile("mflr %0" : "=r" (lr));
2930 lr_saved = lr;
2931 #endif
2932 if (so->so_proto->pr_getlock != NULL)
2933 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
2934 else
2935 mutex_held = so->so_proto->pr_domain->dom_mtx;
2936 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
2937
2938 sofreelastref(so, 0);
2939 }
2940
2941 void
2942 soreference(so)
2943 struct socket *so;
2944 {
2945 socket_lock(so, 1); /* locks & take one reference on socket */
2946 socket_unlock(so, 0); /* unlock only */
2947 }
2948
2949 void
2950 sodereference(so)
2951 struct socket *so;
2952 {
2953 socket_lock(so, 0);
2954 socket_unlock(so, 1);
2955 }
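
/*
 * Hedged kernel-side sketch (not from this file) of the pattern that
 * soreference()/sodereference() above encapsulate: take the lock together
 * with a use-count reference, drop only the lock while blocking, then
 * reacquire the lock and give the reference back so the last release can
 * reach sofreelastref(). The blocking step is a placeholder.
 */
static void
do_blocking_work(struct socket *so)
{
	socket_lock(so, 1);	/* lock and take a use-count reference */
	socket_unlock(so, 0);	/* drop only the lock; so_usecount keeps 'so' alive */

	/* ... blocking or long-running work that must not hold the mutex ... */

	socket_lock(so, 0);	/* reacquire the lock without a new reference */
	socket_unlock(so, 1);	/* unlock and release the reference taken above */
}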