bsd/kern/uipc_socket.c

   1 /*
   2  * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1998, 1999 Apple Computer, Inc. All Rights Reserved */
  29 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  30 /*
  31  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  32  *      The Regents of the University of California.  All rights reserved.
  33  *
  34  * Redistribution and use in source and binary forms, with or without
  35  * modification, are permitted provided that the following conditions
  36  * are met:
  37  * 1. Redistributions of source code must retain the above copyright
  38  *    notice, this list of conditions and the following disclaimer.
  39  * 2. Redistributions in binary form must reproduce the above copyright
  40  *    notice, this list of conditions and the following disclaimer in the
  41  *    documentation and/or other materials provided with the distribution.
  42  * 3. All advertising materials mentioning features or use of this software
  43  *    must display the following acknowledgement:
  44  *      This product includes software developed by the University of
  45  *      California, Berkeley and its contributors.
  46  * 4. Neither the name of the University nor the names of its contributors
  47  *    may be used to endorse or promote products derived from this software
  48  *    without specific prior written permission.
  49  *
  50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  60  * SUCH DAMAGE.
  61  *
  62  *      @(#)uipc_socket.c       8.3 (Berkeley) 4/15/94
  63  * $FreeBSD: src/sys/kern/uipc_socket.c,v 1.68.2.16 2001/06/14 20:46:06 ume Exp $
  64  */
  65
  66 #include <sys/param.h>
  67 #include <sys/systm.h>
  68 #include <sys/filedesc.h>
  69 #include <sys/proc_internal.h>
  70 #include <sys/kauth.h>
  71 #include <sys/file_internal.h>
  72 #include <sys/fcntl.h>
  73 #include <sys/malloc.h>
  74 #include <sys/mbuf.h>
  75 #include <sys/domain.h>
  76 #include <sys/kernel.h>
  77 #include <sys/event.h>
  78 #include <sys/poll.h>
  79 #include <sys/protosw.h>
  80 #include <sys/socket.h>
  81 #include <sys/socketvar.h>
  82 #include <sys/resourcevar.h>
  83 #include <sys/signalvar.h>
  84 #include <sys/sysctl.h>
  85 #include <sys/uio.h>
  86 #include <sys/ev.h>
  87 #include <sys/kdebug.h>
  88 #include <net/route.h>
  89 #include <netinet/in.h>
  90 #include <netinet/in_pcb.h>
  91 #include <kern/zalloc.h>
  92 #include <kern/locks.h>
  93 #include <machine/limits.h>
  94
  95 int                     so_cache_hw = 0;
  96 int                     so_cache_timeouts = 0;
  97 int                     so_cache_max_freed = 0;
  98 int                     cached_sock_count = 0;
  99 struct socket           *socket_cache_head = 0;
 100 struct socket           *socket_cache_tail = 0;
 101 u_long                  so_cache_time = 0;
 102 int                     so_cache_init_done = 0;
 103 struct zone             *so_cache_zone;
 104 extern int              get_inpcb_str_size();
 105 extern int              get_tcp_str_size();
 106
 107 static lck_grp_t                *so_cache_mtx_grp;
 108 static lck_attr_t               *so_cache_mtx_attr;
 109 static lck_grp_attr_t   *so_cache_mtx_grp_attr;
 110 lck_mtx_t                               *so_cache_mtx;
 111
 112 #include <machine/limits.h>
 113
 114 static void     filt_sordetach(struct knote *kn);
 115 static int      filt_soread(struct knote *kn, long hint);
 116 static void     filt_sowdetach(struct knote *kn);
 117 static int      filt_sowrite(struct knote *kn, long hint);
 118 static int      filt_solisten(struct knote *kn, long hint);
 119
 120 static struct filterops solisten_filtops =
 121   { 1, NULL, filt_sordetach, filt_solisten };
 122 static struct filterops soread_filtops =
 123   { 1, NULL, filt_sordetach, filt_soread };
 124 static struct filterops sowrite_filtops =
 125   { 1, NULL, filt_sowdetach, filt_sowrite };
 126
 127 #define EVEN_MORE_LOCKING_DEBUG 0
 128 int socket_debug = 0;
 129 int socket_zone = M_SOCKET;
 130 so_gen_t        so_gencnt;      /* generation count for sockets */
 131
 132 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
 133 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
 134
 135 #define DBG_LAYER_IN_BEG        NETDBG_CODE(DBG_NETSOCK, 0)
 136 #define DBG_LAYER_IN_END        NETDBG_CODE(DBG_NETSOCK, 2)
 137 #define DBG_LAYER_OUT_BEG       NETDBG_CODE(DBG_NETSOCK, 1)
 138 #define DBG_LAYER_OUT_END       NETDBG_CODE(DBG_NETSOCK, 3)
 139 #define DBG_FNC_SOSEND          NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
 140 #define DBG_FNC_SORECEIVE       NETDBG_CODE(DBG_NETSOCK, (8 << 8))
 141 #define DBG_FNC_SOSHUTDOWN      NETDBG_CODE(DBG_NETSOCK, (9 << 8))
 142
 143 #define MAX_SOOPTGETM_SIZE      (128 * MCLBYTES)
 144
 145
 146 SYSCTL_DECL(_kern_ipc);
 147
 148 static int somaxconn = SOMAXCONN;
 149 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn,
 150            0, "");
 151
 152 /* Should we get a maximum also ??? */
 153 static int sosendmaxchain = 65536;
 154 static int sosendminchain = 16384;
 155 static int sorecvmincopy  = 16384;
 156 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain, CTLFLAG_RW, &sosendminchain,
 157            0, "");
 158 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy, CTLFLAG_RW, &sorecvmincopy,
 159            0, "");
 160
 161 void  so_cache_timer();
 162
 163 /*
 164  * Socket operation routines.
 165  * These routines are called by the routines in
 166  * sys_socket.c or from a system process, and
 167  * implement the semantics of socket operations by
 168  * switching out to the protocol specific routines.
 169  */
 170
 171 #ifdef __APPLE__
 172
 173 vm_size_t       so_cache_zone_element_size;
 174
 175 static int sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list, int *resid);
 176
 177
 178 void socketinit()
 179 {
 180     vm_size_t   str_size;
 181
 182         if (so_cache_init_done) {
 183                 printf("socketinit: already called...\n");
 184                 return;
 185         }
 186
 187         /*
 188          * allocate lock group attribute and group for socket cache mutex
 189          */
 190         so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
 191         lck_grp_attr_setdefault(so_cache_mtx_grp_attr);
 192
 193         so_cache_mtx_grp = lck_grp_alloc_init("so_cache", so_cache_mtx_grp_attr);
 194
 195         /*
 196          * allocate the lock attribute for socket cache mutex
 197          */
 198         so_cache_mtx_attr = lck_attr_alloc_init();
 199         lck_attr_setdefault(so_cache_mtx_attr);
 200
 201     so_cache_init_done = 1;
 202
 203     so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);     /* cached sockets mutex */
 204
 205     if (so_cache_mtx == NULL)
 206                 return; /* we're hosed... */
 207
 208     str_size = (vm_size_t)( sizeof(struct socket) + 4 +
 209                             get_inpcb_str_size()  + 4 +
 210                             get_tcp_str_size());
 211     so_cache_zone = zinit (str_size, 120000*str_size, 8192, "socache zone");
 212 #if TEMPDEBUG
 213     printf("cached_sock_alloc -- so_cache_zone size is %x\n", str_size);
 214 #endif
 215     timeout(so_cache_timer, NULL, (SO_CACHE_FLUSH_INTERVAL * hz));
 216
 217     so_cache_zone_element_size = str_size;
 218
 219     sflt_init();
 220
 221 }
 222
 223 void   cached_sock_alloc(so, waitok)
 224 struct socket **so;
 225 int           waitok;
 226
 227 {
 228     caddr_t     temp;
 229     register u_long  offset;
 230
 231
 232         lck_mtx_lock(so_cache_mtx);
 233
 234     if (cached_sock_count) {
 235             cached_sock_count--;
 236             *so = socket_cache_head;
 237             if (*so == 0)
 238                     panic("cached_sock_alloc: cached sock is null");
 239
 240             socket_cache_head = socket_cache_head->cache_next;
 241             if (socket_cache_head)
 242                     socket_cache_head->cache_prev = 0;
 243             else
 244                     socket_cache_tail = 0;
 245
 246                 lck_mtx_unlock(so_cache_mtx);
 247
 248             temp = (*so)->so_saved_pcb;
 249             bzero((caddr_t)*so, sizeof(struct socket));
 250 #if TEMPDEBUG
 251             kprintf("cached_sock_alloc - retreiving cached sock %x - count == %d\n", *so,
 252                    cached_sock_count);
 253 #endif
 254             (*so)->so_saved_pcb = temp;
 255             (*so)->cached_in_sock_layer = 1;
 256
 257     }
 258     else {
 259 #if TEMPDEBUG
 260             kprintf("Allocating cached sock %x from memory\n", *so);
 261 #endif
 262
 263             lck_mtx_unlock(so_cache_mtx);
 264
 265             if (waitok)
 266                  *so = (struct socket *) zalloc(so_cache_zone);
 267             else
 268                  *so = (struct socket *) zalloc_noblock(so_cache_zone);
 269
 270             if (*so == 0)
 271                  return;
 272
 273             bzero((caddr_t)*so, sizeof(struct socket));
 274
 275             /*
 276              * Define offsets for extra structures into our single block of
 277              * memory. Align extra structures on longword boundaries.
 278              */
 279
 280
 281             offset = (u_long) *so;
 282             offset += sizeof(struct socket);
 283             if (offset & 0x3) {
 284                 offset += 4;
 285                 offset &= 0xfffffffc;
 286             }
 287             (*so)->so_saved_pcb = (caddr_t) offset;
 288             offset += get_inpcb_str_size();
 289             if (offset & 0x3) {
 290                 offset += 4;
 291                 offset &= 0xfffffffc;
 292             }
 293
 294             ((struct inpcb *) (*so)->so_saved_pcb)->inp_saved_ppcb = (caddr_t) offset;
 295 #if TEMPDEBUG
 296             kprintf("Allocating cached socket - %x, pcb=%x tcpcb=%x\n", *so,
 297                     (*so)->so_saved_pcb,
 298                     ((struct inpcb *)(*so)->so_saved_pcb)->inp_saved_ppcb);
 299 #endif
 300     }
 301
 302     (*so)->cached_in_sock_layer = 1;
 303 }
 304
 305
 306 void cached_sock_free(so)
 307 struct socket *so;
 308 {
 309
 310         lck_mtx_lock(so_cache_mtx);
 311
 312         if (++cached_sock_count > MAX_CACHED_SOCKETS) {
 313                 --cached_sock_count;
 314                 lck_mtx_unlock(so_cache_mtx);
 315 #if TEMPDEBUG
 316                 kprintf("Freeing overflowed cached socket %x\n", so);
 317 #endif
 318                 zfree(so_cache_zone, so);
 319         }
 320         else {
 321 #if TEMPDEBUG
 322                 kprintf("Freeing socket %x into cache\n", so);
 323 #endif
 324                 if (so_cache_hw < cached_sock_count)
 325                         so_cache_hw = cached_sock_count;
 326
 327                 so->cache_next = socket_cache_head;
 328                 so->cache_prev = 0;
 329                 if (socket_cache_head)
 330                         socket_cache_head->cache_prev = so;
 331                 else
 332                         socket_cache_tail = so;
 333
 334                 so->cache_timestamp = so_cache_time;
 335                 socket_cache_head = so;
 336                 lck_mtx_unlock(so_cache_mtx);
 337         }
 338
 339 #if TEMPDEBUG
 340         kprintf("Freed cached sock %x into cache - count is %d\n", so, cached_sock_count);
 341 #endif
 342
 343
 344 }
 345
 346
 347 void so_cache_timer()
 348 {
 349         register struct socket  *p;
 350         register int            n_freed = 0;
 351
 352
 353         lck_mtx_lock(so_cache_mtx);
 354
 355         ++so_cache_time;
 356
 357         while ( (p = socket_cache_tail) )
 358         {
 359                 if ((so_cache_time - p->cache_timestamp) < SO_CACHE_TIME_LIMIT)
 360                         break;
 361
 362                 so_cache_timeouts++;
 363
 364                 if ( (socket_cache_tail = p->cache_prev) )
 365                         p->cache_prev->cache_next = 0;
 366                 if (--cached_sock_count == 0)
 367                         socket_cache_head = 0;
 368
 369
 370                 zfree(so_cache_zone, p);
 371
 372                 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH)
 373                 {
 374                         so_cache_max_freed++;
 375                         break;
 376                 }
 377         }
 378         lck_mtx_unlock(so_cache_mtx);
 379
 380         timeout(so_cache_timer, NULL, (SO_CACHE_FLUSH_INTERVAL * hz));
 381
 382
 383 }
 384 #endif /* __APPLE__ */
 385
 386 /*
 387  * Get a socket structure from our zone, and initialize it.
 388  * We don't implement `waitok' yet (see comments in uipc_domain.c).
 389  * Note that it would probably be better to allocate socket
 390  * and PCB at the same time, but I'm not convinced that all
 391  * the protocols can be easily modified to do this.
 392  */
 393 struct socket *
 394 soalloc(waitok, dom, type)
 395         int waitok;
 396         int dom;
 397         int type;
 398 {
 399         struct socket *so;
 400
 401         if ((dom == PF_INET) && (type == SOCK_STREAM))
 402             cached_sock_alloc(&so, waitok);
 403         else
 404         {
 405              MALLOC_ZONE(so, struct socket *, sizeof(*so), socket_zone, M_WAITOK);
 406              if (so)
 407                   bzero(so, sizeof *so);
 408         }
 409         /* XXX race condition for reentrant kernel */
 410 //###LD Atomic add for so_gencnt
 411         if (so) {
 412              so->so_gencnt = ++so_gencnt;
 413              so->so_zone = socket_zone;
 414         }
 415
 416         return so;
 417 }
 418
 419 int
 420 socreate(dom, aso, type, proto)
 421         int dom;
 422         struct socket **aso;
 423         register int type;
 424         int proto;
 425 {
 426         struct proc *p = current_proc();
 427         register struct protosw *prp;
 428         register struct socket *so;
 429         register int error = 0;
 430 #if TCPDEBUG
 431         extern int tcpconsdebug;
 432 #endif
 433         if (proto)
 434                 prp = pffindproto(dom, proto, type);
 435         else
 436                 prp = pffindtype(dom, type);
 437
 438         if (prp == 0 || prp->pr_usrreqs->pru_attach == 0)
 439                 return (EPROTONOSUPPORT);
 440 #ifndef __APPLE__
 441
 442         if (p->p_prison && jail_socket_unixiproute_only &&
 443             prp->pr_domain->dom_family != PF_LOCAL &&
 444             prp->pr_domain->dom_family != PF_INET &&
 445             prp->pr_domain->dom_family != PF_ROUTE) {
 446                 return (EPROTONOSUPPORT);
 447         }
 448
 449 #endif
 450         if (prp->pr_type != type)
 451                 return (EPROTOTYPE);
 452         so = soalloc(p != 0, dom, type);
 453         if (so == 0)
 454                 return (ENOBUFS);
 455
 456         TAILQ_INIT(&so->so_incomp);
 457         TAILQ_INIT(&so->so_comp);
 458         so->so_type = type;
 459
 460 #ifdef __APPLE__
 461         if (p != 0) {
 462                 so->so_uid = kauth_cred_getuid(kauth_cred_get());
 463                 if (!suser(kauth_cred_get(),NULL))
 464                         so->so_state = SS_PRIV;
 465         }
 466 #else
 467         so->so_cred = kauth_cred_get_with_ref();
 468 #endif
 469         so->so_proto = prp;
 470 #ifdef __APPLE__
 471         so->so_rcv.sb_flags |= SB_RECV; /* XXX */
 472         so->so_rcv.sb_so = so->so_snd.sb_so = so;
 473 #endif
 474
 475 //### Attachement will create the per pcb lock if necessary and increase refcount
 476         so->so_usecount++;      /* for creation, make sure it's done before socket is inserted in lists */
 477
 478         error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
 479         if (error) {
 480                 /*
 481                  * Warning:
 482                  * If so_pcb is not zero, the socket will be leaked,
 483                  * so protocol attachment handler must be coded carefuly
 484                  */
 485                 so->so_state |= SS_NOFDREF;
 486                 so->so_usecount--;
 487                 sofreelastref(so, 1);   /* will deallocate the socket */
 488                 return (error);
 489         }
 490 #ifdef __APPLE__
 491         prp->pr_domain->dom_refs++;
 492         TAILQ_INIT(&so->so_evlist);
 493
 494         /* Attach socket filters for this protocol */
 495         sflt_initsock(so);
 496 #if TCPDEBUG
 497         if (tcpconsdebug == 2)
 498                 so->so_options |= SO_DEBUG;
 499 #endif
 500 #endif
 501
 502         *aso = so;
 503         return (0);
 504 }
 505
 506 int
 507 sobind(so, nam)
 508         struct socket *so;
 509         struct sockaddr *nam;
 510
 511 {
 512         struct proc *p = current_proc();
 513         int error = 0;
 514         struct socket_filter_entry      *filter;
 515         int                                                     filtered = 0;
 516
 517         socket_lock(so, 1);
 518
 519         /* Socket filter */
 520         error = 0;
 521         for (filter = so->so_filt; filter && (error == 0);
 522                  filter = filter->sfe_next_onsocket) {
 523                 if (filter->sfe_filter->sf_filter.sf_bind) {
 524                         if (filtered == 0) {
 525                                 filtered = 1;
 526                                 sflt_use(so);
 527                                 socket_unlock(so, 0);
 528                         }
 529                         error = filter->sfe_filter->sf_filter.sf_bind(
 530                                                 filter->sfe_cookie, so, nam);
 531                 }
 532         }
 533         if (filtered != 0) {
 534                 socket_lock(so, 0);
 535                 sflt_unuse(so);
 536         }
 537         /* End socket filter */
 538
 539         if (error == 0)
 540                 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
 541
 542         socket_unlock(so, 1);
 543
 544         if (error == EJUSTRETURN)
 545                 error = 0;
 546
 547         return (error);
 548 }
 549
 550 void
 551 sodealloc(so)
 552         struct socket *so;
 553 {
 554         so->so_gencnt = ++so_gencnt;
 555
 556 #ifndef __APPLE__
 557         if (so->so_rcv.sb_hiwat)
 558                 (void)chgsbsize(so->so_cred->cr_uidinfo,
 559                     &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
 560         if (so->so_snd.sb_hiwat)
 561                 (void)chgsbsize(so->so_cred->cr_uidinfo,
 562                     &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
 563 #ifdef INET
 564         if (so->so_accf != NULL) {
 565                 if (so->so_accf->so_accept_filter != NULL &&
 566                         so->so_accf->so_accept_filter->accf_destroy != NULL) {
 567                         so->so_accf->so_accept_filter->accf_destroy(so);
 568                 }
 569                 if (so->so_accf->so_accept_filter_str != NULL)
 570                         FREE(so->so_accf->so_accept_filter_str, M_ACCF);
 571                 FREE(so->so_accf, M_ACCF);
 572         }
 573 #endif /* INET */
 574         kauth_cred_rele(so->so_cred);
 575         zfreei(so->so_zone, so);
 576 #else
 577         if (so->cached_in_sock_layer == 1)
 578              cached_sock_free(so);
 579         else {
 580              if (so->cached_in_sock_layer == -1)
 581                         panic("sodealloc: double dealloc: so=%x\n", so);
 582              so->cached_in_sock_layer = -1;
 583              FREE_ZONE(so, sizeof(*so), so->so_zone);
 584         }
 585 #endif /* __APPLE__ */
 586 }
 587
 588 int
 589 solisten(so, backlog)
 590         register struct socket *so;
 591         int backlog;
 592
 593 {
 594         struct proc *p = current_proc();
 595         int error;
 596
 597         socket_lock(so, 1);
 598
 599         {
 600                 struct socket_filter_entry      *filter;
 601                 int                                                     filtered = 0;
 602                 error = 0;
 603                 for (filter = so->so_filt; filter && (error == 0);
 604                          filter = filter->sfe_next_onsocket) {
 605                         if (filter->sfe_filter->sf_filter.sf_listen) {
 606                                 if (filtered == 0) {
 607                                         filtered = 1;
 608                                         sflt_use(so);
 609                                         socket_unlock(so, 0);
 610                                 }
 611                                 error = filter->sfe_filter->sf_filter.sf_listen(
 612                                                         filter->sfe_cookie, so);
 613                         }
 614                 }
 615                 if (filtered != 0) {
 616                         socket_lock(so, 0);
 617                         sflt_unuse(so);
 618                 }
 619         }
 620
 621         if (error == 0) {
 622                 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
 623         }
 624
 625         if (error) {
 626                 socket_unlock(so, 1);
 627                 if (error == EJUSTRETURN)
 628                         error = 0;
 629                 return (error);
 630         }
 631
 632         if (TAILQ_EMPTY(&so->so_comp))
 633                 so->so_options |= SO_ACCEPTCONN;
 634         if (backlog < 0 || backlog > somaxconn)
 635                 backlog = somaxconn;
 636         so->so_qlimit = backlog;
 637
 638         socket_unlock(so, 1);
 639         return (0);
 640 }
 641
 642 void
 643 sofreelastref(so, dealloc)
 644         register struct socket *so;
 645         int dealloc;
 646 {
 647         int error;
 648         struct socket *head = so->so_head;
 649
 650         /*### Assume socket is locked */
 651
 652         /* Remove any filters - may be called more than once */
 653         sflt_termsock(so);
 654
 655         if ((!(so->so_flags & SOF_PCBCLEARING)) || ((so->so_state & SS_NOFDREF) == 0)) {
 656 #ifdef __APPLE__
 657                 selthreadclear(&so->so_snd.sb_sel);
 658                 selthreadclear(&so->so_rcv.sb_sel);
 659                 so->so_rcv.sb_flags &= ~SB_UPCALL;
 660                 so->so_snd.sb_flags &= ~SB_UPCALL;
 661 #endif
 662                 return;
 663         }
 664         if (head != NULL) {
 665                 socket_lock(head, 1);
 666                 if (so->so_state & SS_INCOMP) {
 667                         TAILQ_REMOVE(&head->so_incomp, so, so_list);
 668                         head->so_incqlen--;
 669                 } else if (so->so_state & SS_COMP) {
 670                         /*
 671                          * We must not decommission a socket that's
 672                          * on the accept(2) queue.  If we do, then
 673                          * accept(2) may hang after select(2) indicated
 674                          * that the listening socket was ready.
 675                          */
 676 #ifdef __APPLE__
 677                         selthreadclear(&so->so_snd.sb_sel);
 678                         selthreadclear(&so->so_rcv.sb_sel);
 679                         so->so_rcv.sb_flags &= ~SB_UPCALL;
 680                         so->so_snd.sb_flags &= ~SB_UPCALL;
 681 #endif
 682                         socket_unlock(head, 1);
 683                         return;
 684                 } else {
 685                         panic("sofree: not queued");
 686                 }
 687                 head->so_qlen--;
 688                 so->so_state &= ~SS_INCOMP;
 689                 so->so_head = NULL;
 690                 socket_unlock(head, 1);
 691         }
 692 #ifdef __APPLE__
 693         selthreadclear(&so->so_snd.sb_sel);
 694         sbrelease(&so->so_snd);
 695 #endif
 696         sorflush(so);
 697
 698         /* 3932268: disable upcall */
 699         so->so_rcv.sb_flags &= ~SB_UPCALL;
 700         so->so_snd.sb_flags &= ~SB_UPCALL;
 701
 702         if (dealloc)
 703                 sodealloc(so);
 704 }
 705
 706 /*
 707  * Close a socket on last file table reference removal.
 708  * Initiate disconnect if connected.
 709  * Free socket when disconnect complete.
 710  */
 711 int
 712 soclose_locked(so)
 713         register struct socket *so;
 714 {
 715         int error = 0;
 716         lck_mtx_t * mutex_held;
 717         struct timespec ts;
 718
 719         if (so->so_usecount == 0) {
 720                 panic("soclose: so=%x refcount=0\n", so);
 721         }
 722
 723         sflt_notify(so, sock_evt_closing, NULL);
 724
 725         if ((so->so_options & SO_ACCEPTCONN)) {
 726                 struct socket *sp;
 727
 728                 /* We do not want new connection to be added to the connection queues */
 729                 so->so_options &= ~SO_ACCEPTCONN;
 730
 731                 while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
 732                         /* A bit tricky here. We need to keep
 733                          * a lock if it's a protocol global lock
 734                          * but we want the head, not the socket locked
 735                          * in the case of per-socket lock...
 736                          */
 737                         if (so->so_proto->pr_getlock != NULL) {
 738                                 socket_unlock(so, 0);
 739                                 socket_lock(sp, 1);
 740                         }
 741                         (void) soabort(sp);
 742                         if (so->so_proto->pr_getlock != NULL) {
 743                                 socket_unlock(sp, 1);
 744                                 socket_lock(so, 0);
 745                         }
 746                 }
 747
 748                 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
 749                         /* Dequeue from so_comp since sofree() won't do it */
 750                         TAILQ_REMOVE(&so->so_comp, sp, so_list);
 751                         so->so_qlen--;
 752
 753                         if (so->so_proto->pr_getlock != NULL) {
 754                                 socket_unlock(so, 0);
 755                                 socket_lock(sp, 1);
 756                         }
 757
 758                         sp->so_state &= ~SS_COMP;
 759                         sp->so_head = NULL;
 760
 761                         (void) soabort(sp);
 762                         if (so->so_proto->pr_getlock != NULL) {
 763                                 socket_unlock(sp, 1);
 764                                 socket_lock(so, 0);
 765                         }
 766                 }
 767         }
 768         if (so->so_pcb == 0) {
 769                 /* 3915887: mark the socket as ready for dealloc */
 770                 so->so_flags |= SOF_PCBCLEARING;
 771                 goto discard;
 772         }
 773         if (so->so_state & SS_ISCONNECTED) {
 774                 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
 775                         error = sodisconnectlocked(so);
 776                         if (error)
 777                                 goto drop;
 778                 }
 779                 if (so->so_options & SO_LINGER) {
 780                         if ((so->so_state & SS_ISDISCONNECTING) &&
 781                             (so->so_state & SS_NBIO))
 782                                 goto drop;
 783                         if (so->so_proto->pr_getlock != NULL)
 784                                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
 785                         else
 786                                 mutex_held = so->so_proto->pr_domain->dom_mtx;
 787                         while (so->so_state & SS_ISCONNECTED) {
 788                                 ts.tv_sec = (so->so_linger/100);
 789                                 ts.tv_nsec = (so->so_linger % 100) * NSEC_PER_USEC * 1000 * 10;
 790                                 error = msleep((caddr_t)&so->so_timeo, mutex_held,
 791                                     PSOCK | PCATCH, "soclos", &ts);
 792                                 if (error) {
 793                                         /* It's OK when the time fires, don't report an error */
 794                                         if (error == EWOULDBLOCK)
 795                                                 error = 0;
 796                                         break;
 797                                 }
 798                         }
 799                 }
 800         }
 801 drop:
 802         if (so->so_usecount == 0)
 803                 panic("soclose: usecount is zero so=%x\n", so);
 804         if (so->so_pcb && !(so->so_flags & SOF_PCBCLEARING)) {
 805                 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
 806                 if (error == 0)
 807                         error = error2;
 808         }
 809         if (so->so_usecount <= 0)
 810                 panic("soclose: usecount is zero so=%x\n", so);
 811 discard:
 812         if (so->so_pcb && so->so_state & SS_NOFDREF)
 813                 panic("soclose: NOFDREF");
 814         so->so_state |= SS_NOFDREF;
 815 #ifdef __APPLE__
 816         so->so_proto->pr_domain->dom_refs--;
 817         evsofree(so);
 818 #endif
 819         so->so_usecount--;
 820         sofree(so);
 821         return (error);
 822 }
 823
 824 int
 825 soclose(so)
 826         register struct socket *so;
 827 {
 828         int error = 0;
 829         socket_lock(so, 1);
 830         if (so->so_retaincnt == 0)
 831                 error = soclose_locked(so);
 832         else {  /* if the FD is going away, but socket is retained in kernel remove its reference */
 833                 so->so_usecount--;
 834                 if (so->so_usecount < 2)
 835                         panic("soclose: retaincnt non null and so=%x usecount=%x\n", so->so_usecount);
 836         }
 837         socket_unlock(so, 1);
 838         return (error);
 839 }
 840
 841
 842 /*
 843  * Must be called at splnet...
 844  */
 845 //#### Should already be locked
 846 int
 847 soabort(so)
 848         struct socket *so;
 849 {
 850         int error;
 851
 852 #ifdef MORE_LOCKING_DEBUG
 853         lck_mtx_t * mutex_held;
 854
 855         if (so->so_proto->pr_getlock != NULL)
 856                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
 857         else
 858                 mutex_held = so->so_proto->pr_domain->dom_mtx;
 859         lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
 860 #endif
 861
 862         error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
 863         if (error) {
 864                 sofree(so);
 865                 return error;
 866         }
 867         return (0);
 868 }
 869
 870 int
 871 soacceptlock(so, nam, dolock)
 872         register struct socket *so;
 873         struct sockaddr **nam;
 874         int dolock;
 875 {
 876         int error;
 877
 878         if (dolock) socket_lock(so, 1);
 879
 880         if ((so->so_state & SS_NOFDREF) == 0)
 881                 panic("soaccept: !NOFDREF");
 882         so->so_state &= ~SS_NOFDREF;
 883         error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
 884
 885         if (dolock) socket_unlock(so, 1);
 886         return (error);
 887 }
 /*
  * Accept a connection on a socket, taking the socket lock.
  * Equivalent to soacceptlock() with dolock set to 1.
  */
int
soaccept(so, nam)
	register struct socket *so;
	struct sockaddr **nam;
{
	int result;

	result = soacceptlock(so, nam, 1);
	return (result);
}
 895
 896 int
 897 soconnectlock(so, nam, dolock)
 898         register struct socket *so;
 899         struct sockaddr *nam;
 900         int dolock;
 901
 902 {
 903         int s;
 904         int error;
 905         struct proc *p = current_proc();
 906
 907         if (dolock) socket_lock(so, 1);
 908
 909         if (so->so_options & SO_ACCEPTCONN) {
 910                 if (dolock) socket_unlock(so, 1);
 911                 return (EOPNOTSUPP);
 912         }
 913         /*
 914          * If protocol is connection-based, can only connect once.
 915          * Otherwise, if connected, try to disconnect first.
 916          * This allows user to disconnect by connecting to, e.g.,
 917          * a null address.
 918          */
 919         if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
 920             ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
 921             (error = sodisconnectlocked(so))))
 922                 error = EISCONN;
 923         else {
 924                 /*
 925                  * Run connect filter before calling protocol:
 926                  *  - non-blocking connect returns before completion;
 927                  */
 928                 {
 929                         struct socket_filter_entry      *filter;
 930                         int                                                     filtered = 0;
 931                         error = 0;
 932                         for (filter = so->so_filt; filter && (error == 0);
 933                                  filter = filter->sfe_next_onsocket) {
 934                                 if (filter->sfe_filter->sf_filter.sf_connect_out) {
 935                                         if (filtered == 0) {
 936                                                 filtered = 1;
 937                                                 sflt_use(so);
 938                                                 socket_unlock(so, 0);
 939                                         }
 940                                         error = filter->sfe_filter->sf_filter.sf_connect_out(
 941                                                                 filter->sfe_cookie, so, nam);
 942                                 }
 943                         }
 944                         if (filtered != 0) {
 945                                 socket_lock(so, 0);
 946                                 sflt_unuse(so);
 947                         }
 948                 }
 949                 if (error) {
 950                         if (error == EJUSTRETURN)
 951                                 error = 0;
 952                         if (dolock) socket_unlock(so, 1);
 953                         return error;
 954                 }
 955
 956                 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p);
 957         }
 958         if (dolock) socket_unlock(so, 1);
 959         return (error);
 960 }
 961
 /*
  * Initiate a connection on a socket, taking the socket lock.
  * Equivalent to soconnectlock() with dolock set to 1.
  */
int
soconnect(so, nam)
	register struct socket *so;
	struct sockaddr *nam;
{
	int result;

	result = soconnectlock(so, nam, 1);
	return (result);
}
 969
 970 int
 971 soconnect2(so1, so2)
 972         register struct socket *so1;
 973         struct socket *so2;
 974 {
 975         int error;
 976 //####### Assumes so1 is already locked /
 977
 978         socket_lock(so2, 1);
 979
 980         error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
 981
 982         socket_unlock(so2, 1);
 983         return (error);
 984 }
 985
 986
 987 int
 988 sodisconnectlocked(so)
 989         register struct socket *so;
 990 {
 991         int error;
 992
 993         if ((so->so_state & SS_ISCONNECTED) == 0) {
 994                 error = ENOTCONN;
 995                 goto bad;
 996         }
 997         if (so->so_state & SS_ISDISCONNECTING) {
 998                 error = EALREADY;
 999                 goto bad;
1000         }
1001
1002         error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1003
1004         if (error == 0) {
1005                 sflt_notify(so, sock_evt_disconnected, NULL);
1006         }
1007
1008 bad:
1009         return (error);
1010 }
/*
 * Locking version of sodisconnectlocked(): the socket lock is held around
 * the call and the callee's result is returned unchanged.
 */
int
sodisconnect(so)
	register struct socket *so;
{
	int result;

	socket_lock(so, 1);
	result = sodisconnectlocked(so);
	socket_unlock(so, 1);

	return (result);
}
1023
/* Map send/receive flags to the sblock() wait argument: M_DONTWAIT when MSG_DONTWAIT is set, else M_WAIT. */
#define SBLOCKWAIT(f)	((((f) & MSG_DONTWAIT) != 0) ? M_DONTWAIT : M_WAIT)
1025
1026 /*
1027  * sosendcheck will lock the socket buffer if it isn't locked and
1028  * verify that there is space for the data being inserted.
1029  */
1030
1031 static int
1032 sosendcheck(
1033         struct socket *so,
1034         struct sockaddr *addr,
1035         long resid,
1036         long clen,
1037         long atomic,
1038         int flags,
1039         int *sblocked)
1040 {
1041         int error = 0;
1042         long space;
1043         int     assumelock = 0;
1044
1045 restart:
1046         if (*sblocked == 0) {
1047                 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1048                         so->so_send_filt_thread != 0 &&
1049                         so->so_send_filt_thread == current_thread()) {
1050                         /*
1051                          * We're being called recursively from a filter,
1052                          * allow this to continue. Radar 4150520.
1053                          * Don't set sblocked because we don't want
1054                          * to perform an unlock later.
1055                          */
1056                         assumelock = 1;
1057                 }
1058                 else {
1059                         error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1060                         if (error) {
1061                                 return error;
1062                         }
1063                         *sblocked = 1;
1064                 }
1065         }
1066
1067         if (so->so_state & SS_CANTSENDMORE)
1068                 return EPIPE;
1069
1070         if (so->so_error) {
1071                 error = so->so_error;
1072                 so->so_error = 0;
1073                 return error;
1074         }
1075
1076         if ((so->so_state & SS_ISCONNECTED) == 0) {
1077                 /*
1078                  * `sendto' and `sendmsg' is allowed on a connection-
1079                  * based socket if it supports implied connect.
1080                  * Return ENOTCONN if not connected and no address is
1081                  * supplied.
1082                  */
1083                 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1084                         (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1085                         if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1086                                 !(resid == 0 && clen != 0))
1087                                 return ENOTCONN;
1088                 } else if (addr == 0 && !(flags&MSG_HOLD))
1089                         return (so->so_proto->pr_flags & PR_CONNREQUIRED) ? ENOTCONN : EDESTADDRREQ;
1090         }
1091         space = sbspace(&so->so_snd);
1092         if (flags & MSG_OOB)
1093                 space += 1024;
1094         if ((atomic && resid > so->so_snd.sb_hiwat) ||
1095                 clen > so->so_snd.sb_hiwat)
1096                 return EMSGSIZE;
1097         if (space < resid + clen &&
1098                 (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1099                 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) || assumelock) {
1100                         return EWOULDBLOCK;
1101                 }
1102                 sbunlock(&so->so_snd, 1);
1103                 error = sbwait(&so->so_snd);
1104                 if (error) {
1105                         return error;
1106                 }
1107                 goto restart;
1108         }
1109
1110         return 0;
1111 }
1112
1113 /*
1114  * Send on a socket.
1115  * If send must go all at once and message is larger than
1116  * send buffering, then hard error.
1117  * Lock against other senders.
1118  * If must go all at once and not enough room now, then
1119  * inform user that this would block and do nothing.
1120  * Otherwise, if nonblocking, send as much as possible.
1121  * The data to be sent is described by "uio" if nonzero,
1122  * otherwise by the mbuf chain "top" (which must be null
1123  * if uio is not).  Data provided in mbuf chain must be small
1124  * enough to send all at once.
1125  *
1126  * Returns nonzero on error, timeout or signal; callers
1127  * must check for short counts if EINTR/ERESTART are returned.
1128  * Data and control buffers are freed on return.
1129  * Experiment:
1130  * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1131  * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1132  *  point at the mbuf chain being constructed and go from there.
1133  */
1134 int
1135 sosend(so, addr, uio, top, control, flags)
1136         register struct socket *so;
1137         struct sockaddr *addr;
1138         struct uio *uio;
1139         struct mbuf *top;
1140         struct mbuf *control;
1141         int flags;
1142
1143 {
1144         struct mbuf **mp;
1145         register struct mbuf *m, *freelist = NULL;
1146         register long space, len, resid;
1147         int clen = 0, error, dontroute, mlen, sendflags;
1148         int atomic = sosendallatonce(so) || top;
1149         int sblocked = 0;
1150         struct proc *p = current_proc();
1151
1152         if (uio)
1153                 // LP64todo - fix this!
1154                 resid = uio_resid(uio);
1155         else
1156                 resid = top->m_pkthdr.len;
1157
1158         KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START),
1159                      so,
1160                      resid,
1161                      so->so_snd.sb_cc,
1162                      so->so_snd.sb_lowat,
1163                      so->so_snd.sb_hiwat);
1164
1165         socket_lock(so, 1);
1166
1167         /*
1168          * In theory resid should be unsigned.
1169          * However, space must be signed, as it might be less than 0
1170          * if we over-committed, and we must use a signed comparison
1171          * of space and resid.  On the other hand, a negative resid
1172          * causes us to loop sending 0-length segments to the protocol.
1173          *
1174          * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1175          * type sockets since that's an error.
1176          */
1177         if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1178                 error = EINVAL;
1179                 socket_unlock(so, 1);
1180                 goto out;
1181         }
1182
1183         dontroute =
1184             (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1185             (so->so_proto->pr_flags & PR_ATOMIC);
1186         if (p)
1187                 p->p_stats->p_ru.ru_msgsnd++;
1188         if (control)
1189                 clen = control->m_len;
1190
1191         do {
1192                 error = sosendcheck(so, addr, resid, clen, atomic, flags, &sblocked);
1193                 if (error) {
1194                         goto release;
1195                 }
1196                 mp = &top;
1197                 space = sbspace(&so->so_snd) - clen + ((flags & MSG_OOB) ? 1024 : 0);
1198
1199                 do {
1200
1201                     if (uio == NULL) {
1202                                 /*
1203                                  * Data is prepackaged in "top".
1204                                  */
1205                                 resid = 0;
1206                                 if (flags & MSG_EOR)
1207                                         top->m_flags |= M_EOR;
1208                         } else {
1209                                 int             chainlength;
1210                                 int             bytes_to_copy;
1211
1212                                 bytes_to_copy = min(resid, space);
1213
1214                                 if (sosendminchain > 0) {
1215                                         chainlength = 0;
1216                                 } else
1217                                         chainlength = sosendmaxchain;
1218
1219                                 socket_unlock(so, 0);
1220
1221                                 do {
1222                                         int num_needed;
1223                                         int hdrs_needed = (top == 0) ? 1 : 0;
1224
1225                                         /*
1226                                          * try to maintain a local cache of mbuf clusters needed to complete this write
1227                                          * the list is further limited to the number that are currently needed to fill the socket
1228                                          * this mechanism allows a large number of mbufs/clusters to be grabbed under a single
1229                                          * mbuf lock... if we can't get any clusters, than fall back to trying for mbufs
1230                                          * if we fail early (or miscalcluate the number needed) make sure to release any clusters
1231                                          * we haven't yet consumed.
1232                                          */
1233                                         if (freelist == NULL && bytes_to_copy > MCLBYTES) {
1234                                                 num_needed = bytes_to_copy / NBPG;
1235
1236                                                 if ((bytes_to_copy - (num_needed * NBPG)) >= MINCLSIZE)
1237                                                         num_needed++;
1238
1239                                                 freelist = m_getpackets_internal(&num_needed, hdrs_needed, M_WAIT, 0, NBPG);
1240                                                 /* Fall back to cluster size if allocation failed */
1241                                         }
1242
1243                                         if (freelist == NULL && bytes_to_copy > MINCLSIZE) {
1244                                                 num_needed = bytes_to_copy / MCLBYTES;
1245
1246                                                 if ((bytes_to_copy - (num_needed * MCLBYTES)) >= MINCLSIZE)
1247                                                         num_needed++;
1248
1249                                                 freelist = m_getpackets_internal(&num_needed, hdrs_needed, M_WAIT, 0, MCLBYTES);
1250                                                 /* Fall back to a single mbuf if allocation failed */
1251                                         }
1252
1253                                         if (freelist == NULL) {
1254                                                 if (top == 0)
1255                                                         MGETHDR(freelist, M_WAIT, MT_DATA);
1256                                                 else
1257                                                         MGET(freelist, M_WAIT, MT_DATA);
1258
1259                                                 if (freelist == NULL) {
1260                                                         error = ENOBUFS;
1261                                                         socket_lock(so, 0);
1262                                                         goto release;
1263                                                 }
1264                                                 /*
1265                                                  * For datagram protocols, leave room
1266                                                  * for protocol headers in first mbuf.
1267                                                  */
1268                                                 if (atomic && top == 0 && bytes_to_copy < MHLEN)
1269                                                         MH_ALIGN(freelist, bytes_to_copy);
1270                                         }
1271                                         m = freelist;
1272                                         freelist = m->m_next;
1273                                         m->m_next = NULL;
1274
1275                                         if ((m->m_flags & M_EXT))
1276                                                 mlen = m->m_ext.ext_size;
1277                                         else if ((m->m_flags & M_PKTHDR))
1278                                                 mlen = MHLEN - m_leadingspace(m);
1279                                         else
1280                                                 mlen = MLEN;
1281                                         len = min(mlen, bytes_to_copy);
1282
1283                                         chainlength += len;
1284
1285                                         space -= len;
1286
1287                                         error = uiomove(mtod(m, caddr_t), (int)len, uio);
1288
1289                                         // LP64todo - fix this!
1290                                         resid = uio_resid(uio);
1291
1292                                         m->m_len = len;
1293                                         *mp = m;
1294                                         top->m_pkthdr.len += len;
1295                                         if (error)
1296                                                 break;
1297                                         mp = &m->m_next;
1298                                         if (resid <= 0) {
1299                                                 if (flags & MSG_EOR)
1300                                                         top->m_flags |= M_EOR;
1301                                                 break;
1302                                         }
1303                                         bytes_to_copy = min(resid, space);
1304
1305                                 } while (space > 0 && (chainlength < sosendmaxchain || atomic || resid < MINCLSIZE));
1306
1307                                 socket_lock(so, 0);
1308
1309                                 if (error)
1310                                         goto release;
1311                         }
1312
1313                     if (flags & (MSG_HOLD|MSG_SEND))
1314                     {
1315                                 /* Enqueue for later, go away if HOLD */
1316                                 register struct mbuf *mb1;
1317                                 if (so->so_temp && (flags & MSG_FLUSH))
1318                                 {
1319                                         m_freem(so->so_temp);
1320                                         so->so_temp = NULL;
1321                                 }
1322                                 if (so->so_temp)
1323                                         so->so_tail->m_next = top;
1324                                 else
1325                                         so->so_temp = top;
1326                                 mb1 = top;
1327                                 while (mb1->m_next)
1328                                                 mb1 = mb1->m_next;
1329                                 so->so_tail = mb1;
1330                                 if (flags & MSG_HOLD)
1331                                 {
1332                                         top = NULL;
1333                                         goto release;
1334                                 }
1335                                 top = so->so_temp;
1336                     }
1337                     if (dontroute)
1338                             so->so_options |= SO_DONTROUTE;
1339                     /* Compute flags here, for pru_send and NKEs */
1340                     sendflags = (flags & MSG_OOB) ? PRUS_OOB :
1341                         /*
1342                          * If the user set MSG_EOF, the protocol
1343                          * understands this flag and nothing left to
1344                          * send then use PRU_SEND_EOF instead of PRU_SEND.
1345                          */
1346                         ((flags & MSG_EOF) &&
1347                          (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1348                          (resid <= 0)) ?
1349                                 PRUS_EOF :
1350                         /* If there is more to send set PRUS_MORETOCOME */
1351                         (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
1352
1353                         /*
1354                          * Socket filter processing
1355                          */
1356                         {
1357                                 struct socket_filter_entry *filter;
1358                                 int                                                     filtered;
1359
1360                                 filtered = 0;
1361                                 error = 0;
1362                                 for (filter = so->so_filt; filter && (error == 0);
1363                                          filter = filter->sfe_next_onsocket) {
1364                                         if (filter->sfe_filter->sf_filter.sf_data_out) {
1365                                                 int so_flags = 0;
1366                                                 if (filtered == 0) {
1367                                                         filtered = 1;
1368                                                         so->so_send_filt_thread = current_thread();
1369                                                         sflt_use(so);
1370                                                         socket_unlock(so, 0);
1371                                                         so_flags = (sendflags & MSG_OOB) ? sock_data_filt_flag_oob : 0;
1372                                                 }
1373                                                 error = filter->sfe_filter->sf_filter.sf_data_out(
1374                                                                         filter->sfe_cookie, so, addr, &top, &control, so_flags);
1375                                         }
1376                                 }
1377
1378                                 if (filtered) {
1379                                         /*
1380                                          * At this point, we've run at least one filter.
1381                                          * The socket is unlocked as is the socket buffer.
1382                                          */
1383                                         socket_lock(so, 0);
1384                                         sflt_unuse(so);
1385                                         so->so_send_filt_thread = 0;
1386                                         if (error) {
1387                                                 if (error == EJUSTRETURN) {
1388                                                         error = 0;
1389                                                         clen = 0;
1390                                                         control = 0;
1391                                                         top = 0;
1392                                                 }
1393
1394                                                 goto release;
1395                                         }
1396                                 }
1397                         }
1398                         /*
1399                          * End Socket filter processing
1400                          */
1401
1402                         if (error == EJUSTRETURN) {
1403                                 /* A socket filter handled this data */
1404                                 error = 0;
1405                         }
1406                         else {
1407                                 error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1408                                                         sendflags, top, addr, control, p);
1409                         }
1410 #ifdef __APPLE__
1411                     if (flags & MSG_SEND)
1412                         so->so_temp = NULL;
1413 #endif
1414                     if (dontroute)
1415                             so->so_options &= ~SO_DONTROUTE;
1416                     clen = 0;
1417                     control = 0;
1418                     top = 0;
1419                     mp = &top;
1420                     if (error)
1421                         goto release;
1422                 } while (resid && space > 0);
1423         } while (resid);
1424
1425 release:
1426         if (sblocked)
1427                 sbunlock(&so->so_snd, 0);       /* will unlock socket */
1428         else
1429                 socket_unlock(so, 1);
1430 out:
1431         if (top)
1432                 m_freem(top);
1433         if (control)
1434                 m_freem(control);
1435         if (freelist)
1436                 m_freem_list(freelist);
1437
1438         KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END,
1439                      so,
1440                      resid,
1441                      so->so_snd.sb_cc,
1442                      space,
1443                      error);
1444
1445         return (error);
1446 }
1447
1448 /*
1449  * Implement receive operations on a socket.
1450  * We depend on the way that records are added to the sockbuf
1451  * by sbappend*.  In particular, each record (mbufs linked through m_next)
1452  * must begin with an address if the protocol so specifies,
1453  * followed by an optional mbuf or mbufs containing ancillary data,
1454  * and then zero or more mbufs of data.
1455  * In order to avoid blocking network interrupts for the entire time here,
1456  * we splx() while doing the actual copy to user space.
1457  * Although the sockbuf is locked, new data may still be appended,
1458  * and thus we must maintain consistency of the sockbuf during that time.
1459  *
1460  * The caller may receive the data as a single mbuf chain by supplying
1461  * an mbuf **mp0 for use in returning the chain.  The uio is then used
1462  * only for the count in uio_resid.
1463  */
1464 int
1465 soreceive(so, psa, uio, mp0, controlp, flagsp)
1466         register struct socket *so;
1467         struct sockaddr **psa;
1468         struct uio *uio;
1469         struct mbuf **mp0;
1470         struct mbuf **controlp;
1471         int *flagsp;
1472 {
1473         register struct mbuf *m, **mp, *ml = NULL;
1474         register int flags, len, error, offset;
1475         struct protosw *pr = so->so_proto;
1476         struct mbuf *nextrecord;
1477         int moff, type = 0;
1478                 // LP64todo - fix this!
1479         int orig_resid = uio_resid(uio);
1480         volatile struct mbuf *free_list;
1481         volatile int delayed_copy_len;
1482         int can_delay;
1483         int need_event;
1484         struct proc *p = current_proc();
1485
1486
1487                 // LP64todo - fix this!
1488         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START,
1489                      so,
1490                      uio_resid(uio),
1491                      so->so_rcv.sb_cc,
1492                      so->so_rcv.sb_lowat,
1493                      so->so_rcv.sb_hiwat);
1494
1495         socket_lock(so, 1);
1496
1497 #ifdef MORE_LOCKING_DEBUG
1498         if (so->so_usecount == 1)
1499                 panic("soreceive: so=%x no other reference on socket\n", so);
1500 #endif
1501         mp = mp0;
1502         if (psa)
1503                 *psa = 0;
1504         if (controlp)
1505                 *controlp = 0;
1506         if (flagsp)
1507                 flags = *flagsp &~ MSG_EOR;
1508         else
1509                 flags = 0;
1510         /*
1511          * When SO_WANTOOBFLAG is set we try to get out-of-band data
1512          * regardless of the flags argument. Here is the case were
1513          * out-of-band data is not inline.
1514          */
1515         if ((flags & MSG_OOB) ||
1516             ((so->so_options & SO_WANTOOBFLAG) != 0 &&
1517              (so->so_options & SO_OOBINLINE) == 0 &&
1518              (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
1519                 m = m_get(M_WAIT, MT_DATA);
1520                 if (m == NULL) {
1521                         socket_unlock(so, 1);
1522                         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, ENOBUFS,0,0,0,0);
1523                         return (ENOBUFS);
1524                 }
1525                 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1526                 if (error)
1527                         goto bad;
1528                 socket_unlock(so, 0);
1529                 do {
1530                 // LP64todo - fix this!
1531                         error = uiomove(mtod(m, caddr_t),
1532                             (int) min(uio_resid(uio), m->m_len), uio);
1533                         m = m_free(m);
1534                 } while (uio_resid(uio) && error == 0 && m);
1535                 socket_lock(so, 0);
1536 bad:
1537                 if (m)
1538                         m_freem(m);
1539 #ifdef __APPLE__
1540                 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
1541                         if (error == EWOULDBLOCK || error == EINVAL) {
1542                                 /*
1543                                  * Let's try to get normal data:
1544                                  *  EWOULDBLOCK: out-of-band data not receive yet;
1545                                  *  EINVAL: out-of-band data already read.
1546                                  */
1547                                 error = 0;
1548                                 goto nooob;
1549                         } else if (error == 0 && flagsp)
1550                                 *flagsp |= MSG_OOB;
1551                 }
1552                 socket_unlock(so, 1);
1553                 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,0,0,0,0);
1554 #endif
1555                 return (error);
1556         }
1557 nooob:
1558         if (mp)
1559                 *mp = (struct mbuf *)0;
1560         if (so->so_state & SS_ISCONFIRMING && uio_resid(uio))
1561                 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
1562
1563
1564         free_list = (struct mbuf *)0;
1565         delayed_copy_len = 0;
1566 restart:
1567 #ifdef MORE_LOCKING_DEBUG
1568         if (so->so_usecount <= 1)
1569                 printf("soreceive: sblock so=%x ref=%d on socket\n", so, so->so_usecount);
1570 #endif
1571         error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1572         if (error) {
1573                 socket_unlock(so, 1);
1574                 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,0,0,0,0);
1575                 return (error);
1576         }
1577
1578         m = so->so_rcv.sb_mb;
1579         /*
1580          * If we have less data than requested, block awaiting more
1581          * (subject to any timeout) if:
1582          *   1. the current count is less than the low water mark, or
1583          *   2. MSG_WAITALL is set, and it is possible to do the entire
1584          *      receive operation at once if we block (resid <= hiwat).
1585          *   3. MSG_DONTWAIT is not set
1586          * If MSG_WAITALL is set but resid is larger than the receive buffer,
1587          * we have to do the receive in sections, and thus risk returning
1588          * a short count if a timeout or signal occurs after we start.
1589          */
1590         if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
1591             so->so_rcv.sb_cc < uio_resid(uio)) &&
1592            (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
1593             ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
1594             m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
1595
1596                 KASSERT(m != 0 || !so->so_rcv.sb_cc, ("receive 1"));
1597                 if (so->so_error) {
1598                         if (m)
1599                                 goto dontblock;
1600                         error = so->so_error;
1601                         if ((flags & MSG_PEEK) == 0)
1602                                 so->so_error = 0;
1603                         goto release;
1604                 }
1605                 if (so->so_state & SS_CANTRCVMORE) {
1606                         if (m)
1607                                 goto dontblock;
1608                         else
1609                                 goto release;
1610                 }
1611                 for (; m; m = m->m_next)
1612                         if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
1613                                 m = so->so_rcv.sb_mb;
1614                                 goto dontblock;
1615                         }
1616                 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1617                     (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1618                         error = ENOTCONN;
1619                         goto release;
1620                 }
1621                 if (uio_resid(uio) == 0)
1622                         goto release;
1623                 if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1624                         error = EWOULDBLOCK;
1625                         goto release;
1626                 }
1627                 sbunlock(&so->so_rcv, 1);
1628 #ifdef EVEN_MORE_LOCKING_DEBUG
1629                 if (socket_debug)
1630                     printf("Waiting for socket data\n");
1631 #endif
1632
1633                 error = sbwait(&so->so_rcv);
1634 #ifdef EVEN_MORE_LOCKING_DEBUG
1635                 if (socket_debug)
1636                     printf("SORECEIVE - sbwait returned %d\n", error);
1637 #endif
1638                 if (so->so_usecount < 1)
1639                         panic("soreceive: after 2nd sblock so=%x ref=%d on socket\n", so, so->so_usecount);
1640                 if (error) {
1641                         socket_unlock(so, 1);
1642                     KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,0,0,0,0);
1643                     return (error);
1644                 }
1645                 goto restart;
1646         }
1647 dontblock:
1648 #ifndef __APPLE__
1649         if (uio->uio_procp)
1650                 uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
1651 #else   /* __APPLE__ */
1652         /*
1653          * 2207985
1654          * This should be uio->uio-procp; however, some callers of this
1655          * function use auto variables with stack garbage, and fail to
1656          * fill out the uio structure properly.
1657          */
1658         if (p)
1659                 p->p_stats->p_ru.ru_msgrcv++;
1660 #endif  /* __APPLE__ */
1661         nextrecord = m->m_nextpkt;
1662         if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
1663                 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
1664                 orig_resid = 0;
1665                 if (psa) {
1666                         *psa = dup_sockaddr(mtod(m, struct sockaddr *),
1667                                             mp0 == 0);
1668                         if ((*psa == 0) && (flags & MSG_NEEDSA)) {
1669                                 error = EWOULDBLOCK;
1670                                 goto release;
1671                         }
1672                 }
1673                 if (flags & MSG_PEEK) {
1674                         m = m->m_next;
1675                 } else {
1676                         sbfree(&so->so_rcv, m);
1677                         if (m->m_next == 0 && so->so_rcv.sb_cc != 0)
1678                                 panic("soreceive: about to create invalid socketbuf");
1679                         MFREE(m, so->so_rcv.sb_mb);
1680                         m = so->so_rcv.sb_mb;
1681                 }
1682         }
1683         while (m && m->m_type == MT_CONTROL && error == 0) {
1684                 if (flags & MSG_PEEK) {
1685                         if (controlp)
1686                                 *controlp = m_copy(m, 0, m->m_len);
1687                         m = m->m_next;
1688                 } else {
1689                         sbfree(&so->so_rcv, m);
1690                         if (controlp) {
1691                                 if (pr->pr_domain->dom_externalize &&
1692                                     mtod(m, struct cmsghdr *)->cmsg_type ==
1693                                     SCM_RIGHTS) {
1694                                    socket_unlock(so, 0); /* release socket lock: see 3903171 */
1695                                    error = (*pr->pr_domain->dom_externalize)(m);
1696                                    socket_lock(so, 0);
1697                                 }
1698                                 *controlp = m;
1699                                 if (m->m_next == 0 && so->so_rcv.sb_cc != 0)
1700                                         panic("soreceive: so->so_rcv.sb_mb->m_next == 0 && so->so_rcv.sb_cc != 0");
1701                                 so->so_rcv.sb_mb = m->m_next;
1702                                 m->m_next = 0;
1703                                 m = so->so_rcv.sb_mb;
1704                         } else {
1705                                 MFREE(m, so->so_rcv.sb_mb);
1706                                 m = so->so_rcv.sb_mb;
1707                         }
1708                 }
1709                 if (controlp) {
1710                         orig_resid = 0;
1711                         controlp = &(*controlp)->m_next;
1712                 }
1713         }
1714         if (m) {
1715                 if ((flags & MSG_PEEK) == 0)
1716                         m->m_nextpkt = nextrecord;
1717                 type = m->m_type;
1718                 if (type == MT_OOBDATA)
1719                         flags |= MSG_OOB;
1720         }
1721         moff = 0;
1722         offset = 0;
1723
1724         if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
1725                 can_delay = 1;
1726         else
1727                 can_delay = 0;
1728
1729         need_event = 0;
1730
1731         while (m && (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
1732                 if (m->m_type == MT_OOBDATA) {
1733                         if (type != MT_OOBDATA)
1734                                 break;
1735                 } else if (type == MT_OOBDATA)
1736                         break;
1737 #ifndef __APPLE__
1738 /*
1739  * This assertion needs rework.  The trouble is Appletalk is uses many
1740  * mbuf types (NOT listed in mbuf.h!) which will trigger this panic.
1741  * For now just remove the assertion...  CSM 9/98
1742  */
1743                 else
1744                     KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
1745                         ("receive 3"));
1746 #else
1747                 /*
1748                  * Make sure to allways set MSG_OOB event when getting
1749                  * out of band data inline.
1750                  */
1751                 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
1752                         (so->so_options & SO_OOBINLINE) != 0 &&
1753                         (so->so_state & SS_RCVATMARK) != 0) {
1754                         flags |= MSG_OOB;
1755                 }
1756 #endif
1757                 so->so_state &= ~SS_RCVATMARK;
1758                 // LP64todo - fix this!
1759                 len = uio_resid(uio) - delayed_copy_len;
1760                 if (so->so_oobmark && len > so->so_oobmark - offset)
1761                         len = so->so_oobmark - offset;
1762                 if (len > m->m_len - moff)
1763                         len = m->m_len - moff;
1764                 /*
1765                  * If mp is set, just pass back the mbufs.
1766                  * Otherwise copy them out via the uio, then free.
1767                  * Sockbuf must be consistent here (points to current mbuf,
1768                  * it points to next record) when we drop priority;
1769                  * we must note any additions to the sockbuf when we
1770                  * block interrupts again.
1771                  */
1772                 if (mp == 0) {
1773                         if (can_delay && len == m->m_len) {
1774                                 /*
1775                                  * only delay the copy if we're consuming the
1776                                  * mbuf and we're NOT in MSG_PEEK mode
1777                                  * and we have enough data to make it worthwile
1778                                  * to drop and retake the funnel... can_delay
1779                                  * reflects the state of the 2 latter constraints
1780                                  * moff should always be zero in these cases
1781                                  */
1782                                 delayed_copy_len += len;
1783                         } else {
1784
1785                                 if (delayed_copy_len) {
1786                                         error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
1787
1788                                         if (error) {
1789                                                 goto release;
1790                                         }
1791                                         if (m != so->so_rcv.sb_mb) {
1792                                                 /*
1793                                                  * can only get here if MSG_PEEK is not set
1794                                                  * therefore, m should point at the head of the rcv queue...
1795                                                  * if it doesn't, it means something drastically changed
1796                                                  * while we were out from behind the funnel in sodelayed_copy...
1797                                                  * perhaps a RST on the stream... in any event, the stream has
1798                                                  * been interrupted... it's probably best just to return
1799                                                  * whatever data we've moved and let the caller sort it out...
1800                                                  */
1801                                                 break;
1802                                         }
1803                                 }
1804                                 socket_unlock(so, 0);
1805                                 error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
1806                                 socket_lock(so, 0);
1807
1808                                 if (error)
1809                                         goto release;
1810                         }
1811                 } else
1812                         uio_setresid(uio, (uio_resid(uio) - len));
1813
1814                 if (len == m->m_len - moff) {
1815                         if (m->m_flags & M_EOR)
1816                                 flags |= MSG_EOR;
1817                         if (flags & MSG_PEEK) {
1818                                 m = m->m_next;
1819                                 moff = 0;
1820                         } else {
1821                                 nextrecord = m->m_nextpkt;
1822                                 sbfree(&so->so_rcv, m);
1823                                 m->m_nextpkt = NULL;
1824
1825                                 if (mp) {
1826                                         *mp = m;
1827                                         mp = &m->m_next;
1828                                         so->so_rcv.sb_mb = m = m->m_next;
1829                                         *mp = (struct mbuf *)0;
1830                                 } else {
1831                                         if (free_list == NULL)
1832                                             free_list = m;
1833                                         else
1834                                             ml->m_next = m;
1835                                         ml = m;
1836                                         so->so_rcv.sb_mb = m = m->m_next;
1837                                         ml->m_next = 0;
1838                                 }
1839                                 if (m)
1840                                         m->m_nextpkt = nextrecord;
1841                         }
1842                 } else {
1843                         if (flags & MSG_PEEK)
1844                                 moff += len;
1845                         else {
1846                                 if (mp)
1847                                         *mp = m_copym(m, 0, len, M_WAIT);
1848                                 m->m_data += len;
1849                                 m->m_len -= len;
1850                                 so->so_rcv.sb_cc -= len;
1851                         }
1852                 }
1853                 if (so->so_oobmark) {
1854                         if ((flags & MSG_PEEK) == 0) {
1855                                 so->so_oobmark -= len;
1856                                 if (so->so_oobmark == 0) {
1857                                     so->so_state |= SS_RCVATMARK;
1858                                     /*
1859                                      * delay posting the actual event until after
1860                                      * any delayed copy processing has finished
1861                                      */
1862                                     need_event = 1;
1863                                     break;
1864                                 }
1865                         } else {
1866                                 offset += len;
1867                                 if (offset == so->so_oobmark)
1868                                         break;
1869                         }
1870                 }
1871                 if (flags & MSG_EOR)
1872                         break;
1873                 /*
1874                  * If the MSG_WAITALL or MSG_WAITSTREAM flag is set (for non-atomic socket),
1875                  * we must not quit until "uio->uio_resid == 0" or an error
1876                  * termination.  If a signal/timeout occurs, return
1877                  * with a short count but without error.
1878                  * Keep sockbuf locked against other readers.
1879                  */
1880                 while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == 0 && (uio_resid(uio) - delayed_copy_len) > 0 &&
1881                     !sosendallatonce(so) && !nextrecord) {
1882                         if (so->so_error || so->so_state & SS_CANTRCVMORE)
1883                                 goto release;
1884
1885                         if (pr->pr_flags & PR_WANTRCVD && so->so_pcb && (((struct inpcb *)so->so_pcb)->inp_state != INPCB_STATE_DEAD))
1886                                 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1887                         if (sbwait(&so->so_rcv)) {
1888                                 error = 0;
1889                                 goto release;
1890                         }
1891                         /*
1892                          * have to wait until after we get back from the sbwait to do the copy because
1893                          * we will drop the funnel if we have enough data that has been delayed... by dropping
1894                          * the funnel we open up a window allowing the netisr thread to process the incoming packets
1895                          * and to change the state of this socket... we're issuing the sbwait because
1896                          * the socket is empty and we're expecting the netisr thread to wake us up when more
1897                          * packets arrive... if we allow that processing to happen and then sbwait, we
1898                          * could stall forever with packets sitting in the socket if no further packets
1899                          * arrive from the remote side.
1900                          *
1901                          * we want to copy before we've collected all the data to satisfy this request to
1902                          * allow the copy to overlap the incoming packet processing on an MP system
1903                          */
1904                         if (delayed_copy_len > sorecvmincopy && (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
1905
1906                                 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
1907
1908                                 if (error)
1909                                         goto release;
1910                         }
1911                         m = so->so_rcv.sb_mb;
1912                         if (m) {
1913                                 nextrecord = m->m_nextpkt;
1914                         }
1915                 }
1916         }
1917 #ifdef MORE_LOCKING_DEBUG
1918         if (so->so_usecount <= 1)
1919                 panic("soreceive: after big while so=%x ref=%d on socket\n", so, so->so_usecount);
1920 #endif
1921
1922         if (m && pr->pr_flags & PR_ATOMIC) {
1923 #ifdef __APPLE__
1924                 if (so->so_options & SO_DONTTRUNC)
1925                         flags |= MSG_RCVMORE;
1926                 else {
1927 #endif
1928                         flags |= MSG_TRUNC;
1929                         if ((flags & MSG_PEEK) == 0)
1930                                 (void) sbdroprecord(&so->so_rcv);
1931 #ifdef __APPLE__
1932                 }
1933 #endif
1934         }
1935         if ((flags & MSG_PEEK) == 0) {
1936                 if (m == 0)
1937                         so->so_rcv.sb_mb = nextrecord;
1938                 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
1939                         (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1940         }
1941 #ifdef __APPLE__
1942         if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
1943                 flags |= MSG_HAVEMORE;
1944
1945         if (delayed_copy_len) {
1946                 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
1947
1948                 if (error)
1949                         goto release;
1950         }
1951         if (free_list) {
1952                 m_freem_list((struct mbuf *)free_list);
1953                 free_list = (struct mbuf *)0;
1954         }
1955         if (need_event)
1956                 postevent(so, 0, EV_OOB);
1957 #endif
1958         if (orig_resid == uio_resid(uio) && orig_resid &&
1959             (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
1960                 sbunlock(&so->so_rcv, 1);
1961                 goto restart;
1962         }
1963
1964         if (flagsp)
1965                 *flagsp |= flags;
1966 release:
1967 #ifdef MORE_LOCKING_DEBUG
1968         if (so->so_usecount <= 1)
1969                 panic("soreceive: release so=%x ref=%d on socket\n", so, so->so_usecount);
1970 #endif
1971         if (delayed_copy_len) {
1972                 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
1973         }
1974         if (free_list) {
1975                 m_freem_list((struct mbuf *)free_list);
1976         }
1977         sbunlock(&so->so_rcv, 0);       /* will unlock socket */
1978
1979                 // LP64todo - fix this!
1980         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
1981                      so,
1982                      uio_resid(uio),
1983                      so->so_rcv.sb_cc,
1984                      0,
1985                      error);
1986
1987         return (error);
1988 }
1989
1990
/*
 * sodelayed_copy: copy out, then free, the mbufs queued on *free_list.
 *
 *   so        - socket whose lock is dropped for the duration of the copy
 *   uio       - destination of the copied data
 *   free_list - in: head of an m_next-linked chain of mbufs to copy out;
 *               out: always reset to NULL
 *   resid     - out: always reset to 0 (soreceive passes the address of
 *               its delayed_copy_len counter here)
 *
 * Returns 0, or the first non-zero value returned by uiomove().
 *
 * The socket is unlocked with socket_unlock(so, 0) before the copy and
 * re-locked with socket_lock(so, 0) afterwards, so socket state may have
 * changed by the time this returns.  On a uiomove() error the remaining
 * mbufs are not copied, but the whole list is still freed and *resid is
 * still cleared.
 */
1991 static int sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list, int *resid)
1992 {
1993         int         error  = 0;
1994         struct mbuf *m;
1995
1996         m = *free_list;
1997
1998         socket_unlock(so, 0);  /* the copy below runs without the socket lock */
1999
2000         while (m && error == 0) {      /* stop at the end of the chain or on the first error */
2001
2002                 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
2003
2004                 m = m->m_next;
2005         }
2006         m_freem_list(*free_list);      /* frees from the original head, copied or not */
2007
2008         *free_list = (struct mbuf *)NULL;
2009         *resid = 0;
2010
2011         socket_lock(so, 0);
2012
2013         return (error);
2014 }
2015
2016
/*
 * soshutdown: shut down the receive side, the send side, or both sides
 * of a socket.
 *
 *   so  - socket to shut down; locked on entry to this routine and
 *         unlocked again before every return
 *   how - SHUT_RD, SHUT_WR, or any other value.  The value is not
 *         range-checked here: anything other than SHUT_RD or SHUT_WR
 *         takes both the receive-side and the send-side path.
 *
 * Returns the result of the protocol's pru_shutdown routine when the send
 * side is shut down, otherwise 0.
 */
2017 int
2018 soshutdown(so, how)
2019         register struct socket *so;
2020         register int how;
2021 {
2022         register struct protosw *pr = so->so_proto;
2023         int ret;
2024
2025         socket_lock(so, 1);
2026
2027         sflt_notify(so, sock_evt_shutdown, &how);      /* NOTE(review): receives &how, so the callee could presumably alter it - confirm */
2028
2029         if (how != SHUT_WR) {  /* receive side: flush the receive buffer and post EV_RCLOSED */
2030                 sorflush(so);
2031                 postevent(so, 0, EV_RCLOSED);
2032         }
2033         if (how != SHUT_RD) {  /* send side: hand off to the protocol and post EV_WCLOSED */
2034             ret = ((*pr->pr_usrreqs->pru_shutdown)(so));
2035             postevent(so, 0, EV_WCLOSED);
2036             KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, 0,0,0,0,0);
2037                 socket_unlock(so, 1);
2038             return(ret);
2039         }
2040
2041         KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, 0,0,0,0,0);
2042         socket_unlock(so, 1);
2043         return (0);    /* only reached when how == SHUT_RD */
2044 }
2045
/*
 * sorflush: discard everything in a socket's receive buffer and mark the
 * socket as unable to receive more data.
 *
 *   so - socket whose so_rcv buffer is flushed.  When MORE_LOCKING_DEBUG
 *        is defined, the routine asserts that the socket's mutex (from the
 *        protocol's pr_getlock, or else the domain's dom_mtx) is owned by
 *        the caller.
 *
 * Sequence: issue the sock_evt_flush_read notification; take the sockbuf
 * lock with SB_NOINTR set, call socantrcvmore(), release the sockbuf lock;
 * snapshot the sockbuf into a local copy and zero the original in place;
 * then dispose of and release the contents through the snapshot.  Only the
 * back-pointer to the socket and, when SB_KNOTE was set, the si_note list
 * and the SB_KNOTE flag survive in the live sockbuf.
 */
2046 void
2047 sorflush(so)
2048         register struct socket *so;
2049 {
2050         register struct sockbuf *sb = &so->so_rcv;
2051         register struct protosw *pr = so->so_proto;
2052         struct sockbuf asb;    /* snapshot of the receive buffer taken before it is zeroed */
2053
2054 #ifdef MORE_LOCKING_DEBUG
2055         lck_mtx_t * mutex_held;
2056
2057         if (so->so_proto->pr_getlock != NULL)
2058                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
2059         else
2060                 mutex_held = so->so_proto->pr_domain->dom_mtx;
2061         lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
2062 #endif
2063
2064         sflt_notify(so, sock_evt_flush_read, NULL);
2065
2066         sb->sb_flags |= SB_NOINTR;     /* NOTE(review): presumably makes the sblock() wait below non-interruptible - confirm */
2067         (void) sblock(sb, M_WAIT);     /* return value deliberately ignored */
2068         socantrcvmore(so);
2069         sbunlock(sb, 1);       /* NOTE(review): second argument 1 presumably keeps the socket lock held - confirm */
2070 #ifdef __APPLE__
2071         selthreadclear(&sb->sb_sel);
2072 #endif
2073         asb = *sb;
2074         bzero((caddr_t)sb, sizeof (*sb));      /* also clears the SB_NOINTR flag set above */
2075         sb->sb_so = so; /* reestablish link to socket */
2076         if (asb.sb_flags & SB_KNOTE) { /* carry the knote list and its flag over to the emptied sockbuf */
2077                 sb->sb_sel.si_note = asb.sb_sel.si_note;
2078                 sb->sb_flags = SB_KNOTE;
2079         }
2080         if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
2081                 (*pr->pr_domain->dom_dispose)(asb.sb_mb);      /* give the domain a chance to dispose of the queued mbufs */
2082         sbrelease(&asb);
2083 }
2084
2085 /*
2086  * Perhaps this routine, and sooptcopyout(), below, ought to come in
2087  * an additional variant to handle the case where the option value needs
2088  * to be some kind of integer, but not a specific size.
2089  * In addition to their use here, these functions are also called by the
2090  * protocol-level pr_ctloutput() routines.
2091  */
/*
 * sooptcopyin: fetch a socket option value described by sopt into buf.
 *
 *   sopt   - option descriptor; sopt_val and sopt_valsize give the source
 *            and its size.  sopt_valsize is rewritten to len when the
 *            supplied value is larger than len.
 *   buf    - destination buffer, at least len bytes
 *   len    - maximum number of bytes to copy
 *   minlen - minimum acceptable size of the supplied value
 *
 * Returns EINVAL when sopt_valsize is smaller than minlen, the result of
 * copyin() when sopt_p is non-zero, and 0 otherwise.
 */
2092 int
2093 sooptcopyin(sopt, buf, len, minlen)
2094         struct  sockopt *sopt;
2095         void    *buf;
2096         size_t  len;
2097         size_t  minlen;
2098 {
2099         size_t  valsize;
2100
2101         /*
2102          * If the user gives us more than we wanted, we ignore it,
2103          * but if we don't get the minimum length the caller
2104          * wants, we return EINVAL.  On success, sopt->sopt_valsize
2105          * is set to however much we actually retrieved.
2106          */
2107         if ((valsize = sopt->sopt_valsize) < minlen)
2108                 return EINVAL;
2109         if (valsize > len)
2110                 sopt->sopt_valsize = valsize = len;    /* clamp to the destination size */
2111
2112         if (sopt->sopt_p != 0) /* NOTE(review): non-zero sopt_p presumably means sopt_val is a user-space address - confirm */
2113                 return (copyin(sopt->sopt_val, buf, valsize));
2114
2115         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);       /* otherwise sopt_val is used directly as a local address */
2116         return 0;
2117 }
2118
2119 int
2120 sosetopt(so, sopt)
2121         struct socket *so;
2122         struct sockopt *sopt;
2123 {
2124         int     error, optval;
2125         struct  linger l;
2126         struct  timeval tv;
2127         short   val;
2128
2129         socket_lock(so, 1);
2130
2131         if (sopt->sopt_dir != SOPT_SET) {
2132                 sopt->sopt_dir = SOPT_SET;
2133         }
2134
2135         {
2136                 struct socket_filter_entry      *filter;
2137                 int                                                     filtered = 0;
2138                 error = 0;
2139                 for (filter = so->so_filt; filter && (error == 0);
2140                          filter = filter->sfe_next_onsocket) {
2141                         if (filter->sfe_filter->sf_filter.sf_setoption) {
2142                                 if (filtered == 0) {
2143                                         filtered = 1;
2144                                         sflt_use(so);
2145                                         socket_unlock(so, 0);
2146                                 }
2147                                 error = filter->sfe_filter->sf_filter.sf_setoption(
2148                                                         filter->sfe_cookie, so, sopt);
2149                         }
2150                 }
2151
2152                 if (filtered != 0) {
2153                         socket_lock(so, 0);
2154                         sflt_unuse(so);
2155
2156                         if (error) {
2157                                 if (error == EJUSTRETURN)
2158                                         error = 0;
2159                                 goto bad;
2160                         }
2161                 }
2162         }
2163
2164         error = 0;
2165         if (sopt->sopt_level != SOL_SOCKET) {
2166                 if (so->so_proto && so->so_proto->pr_ctloutput) {
2167                         error = (*so->so_proto->pr_ctloutput)
2168                                   (so, sopt);
2169                         socket_unlock(so, 1);
2170                         return (error);
2171                 }
2172                 error = ENOPROTOOPT;
2173         } else {
2174                 switch (sopt->sopt_name) {
2175                 case SO_LINGER:
2176                 case SO_LINGER_SEC:
2177                         error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2178                         if (error)
2179                                 goto bad;
2180
2181                         so->so_linger = (sopt->sopt_name == SO_LINGER) ? l.l_linger : l.l_linger * hz;
2182                         if (l.l_onoff)
2183                                 so->so_options |= SO_LINGER;
2184                         else
2185                                 so->so_options &= ~SO_LINGER;
2186                         break;
2187
2188                 case SO_DEBUG:
2189                 case SO_KEEPALIVE:
2190                 case SO_DONTROUTE:
2191                 case SO_USELOOPBACK:
2192                 case SO_BROADCAST:
2193                 case SO_REUSEADDR:
2194                 case SO_REUSEPORT:
2195                 case SO_OOBINLINE:
2196                 case SO_TIMESTAMP:
2197 #ifdef __APPLE__
2198                 case SO_DONTTRUNC:
2199                 case SO_WANTMORE:
2200                 case SO_WANTOOBFLAG:
2201 #endif
2202                         error = sooptcopyin(sopt, &optval, sizeof optval,
2203                                             sizeof optval);
2204                         if (error)
2205                                 goto bad;
2206                         if (optval)
2207                                 so->so_options |= sopt->sopt_name;
2208                         else
2209                                 so->so_options &= ~sopt->sopt_name;
2210                         break;
2211
2212                 case SO_SNDBUF:
2213                 case SO_RCVBUF:
2214                 case SO_SNDLOWAT:
2215                 case SO_RCVLOWAT:
2216                         error = sooptcopyin(sopt, &optval, sizeof optval,
2217                                             sizeof optval);
2218                         if (error)
2219                                 goto bad;
2220
2221                         /*
2222                          * Values < 1 make no sense for any of these
2223                          * options, so disallow them.
2224                          */
2225                         if (optval < 1) {
2226                                 error = EINVAL;
2227                                 goto bad;
2228                         }
2229
2230                         switch (sopt->sopt_name) {
2231                         case SO_SNDBUF:
2232                         case SO_RCVBUF:
2233                                 if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
2234                                               &so->so_snd : &so->so_rcv,
2235                                               (u_long) optval) == 0) {
2236                                         error = ENOBUFS;
2237                                         goto bad;
2238                                 }
2239                                 break;
2240
2241                         /*
2242                          * Make sure the low-water is never greater than
2243                          * the high-water.
2244                          */
2245                         case SO_SNDLOWAT:
2246                                 so->so_snd.sb_lowat =
2247                                     (optval > so->so_snd.sb_hiwat) ?
2248                                     so->so_snd.sb_hiwat : optval;
2249                                 break;
2250                         case SO_RCVLOWAT:
2251                                 so->so_rcv.sb_lowat =
2252                                     (optval > so->so_rcv.sb_hiwat) ?
2253                                     so->so_rcv.sb_hiwat : optval;
2254                                 break;
2255                         }
2256                         break;
2257
2258                 case SO_SNDTIMEO:
2259                 case SO_RCVTIMEO:
2260                         error = sooptcopyin(sopt, &tv, sizeof tv,
2261                                             sizeof tv);
2262                         if (error)
2263                                 goto bad;
2264
2265                         if (tv.tv_sec < 0 || tv.tv_sec > LONG_MAX ||
2266                             tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
2267                                 error = EDOM;
2268                                 goto bad;
2269                         }
2270
2271                         switch (sopt->sopt_name) {
2272                         case SO_SNDTIMEO:
2273                                 so->so_snd.sb_timeo = tv;
2274                                 break;
2275                         case SO_RCVTIMEO:
2276                                 so->so_rcv.sb_timeo = tv;
2277                                 break;
2278                         }
2279                         break;
2280
2281                 case SO_NKE:
2282                 {
2283                         struct so_nke nke;
2284
2285                         error = sooptcopyin(sopt, &nke,
2286                                                                 sizeof nke, sizeof nke);
2287                         if (error)
2288                           goto bad;
2289
2290                         error = sflt_attach_private(so, NULL, nke.nke_handle, 1);
2291                         break;
2292                 }
2293
2294                 case SO_NOSIGPIPE:
2295                         error = sooptcopyin(sopt, &optval, sizeof optval,
2296                                             sizeof optval);
2297                         if (error)
2298                                 goto bad;
2299                         if (optval)
2300                                 so->so_flags |= SOF_NOSIGPIPE;
2301                         else
2302                                 so->so_flags &= ~SOF_NOSIGPIPE;
2303
2304                         break;
2305
2306                 case SO_NOADDRERR:
2307                         error = sooptcopyin(sopt, &optval, sizeof optval,
2308                                             sizeof optval);
2309                         if (error)
2310                                 goto bad;
2311                         if (optval)
2312                                 so->so_flags |= SOF_NOADDRAVAIL;
2313                         else
2314                                 so->so_flags &= ~SOF_NOADDRAVAIL;
2315
2316                         break;
2317
2318                 default:
2319                         error = ENOPROTOOPT;
2320                         break;
2321                 }
2322                 if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
2323                         (void) ((*so->so_proto->pr_ctloutput)
2324                                   (so, sopt));
2325                 }
2326         }
2327 bad:
2328         socket_unlock(so, 1);
2329         return (error);
2330 }
2331
2332 /* Helper routine for getsockopt */
2333 int
2334 sooptcopyout(sopt, buf, len)
2335         struct  sockopt *sopt;
2336         void    *buf;
2337         size_t  len;
2338 {
2339         int     error;
2340         size_t  valsize;
2341
2342         error = 0;
2343
2344         /*
2345          * Documented get behavior is that we always return a value,
2346          * possibly truncated to fit in the user's buffer.
2347          * Traditional behavior is that we always tell the user
2348          * precisely how much we copied, rather than something useful
2349          * like the total amount we had available for her.
2350          * Note that this interface is not idempotent; the entire answer must
2351          * generated ahead of time.
2352          */
2353         valsize = min(len, sopt->sopt_valsize);
2354         sopt->sopt_valsize = valsize;
2355         if (sopt->sopt_val != USER_ADDR_NULL) {
2356                 if (sopt->sopt_p != 0)
2357                         error = copyout(buf, sopt->sopt_val, valsize);
2358                 else
2359                         bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
2360         }
2361         return error;
2362 }
2363
2364 int
2365 sogetopt(so, sopt)
2366         struct socket *so;
2367         struct sockopt *sopt;
2368 {
2369         int     error, optval;
2370         struct  linger l;
2371         struct  timeval tv;
2372
2373         if (sopt->sopt_dir != SOPT_GET) {
2374                 sopt->sopt_dir = SOPT_GET;
2375         }
2376
2377         socket_lock(so, 1);
2378
2379         {
2380                 struct socket_filter_entry      *filter;
2381                 int                                                     filtered = 0;
2382                 error = 0;
2383                 for (filter = so->so_filt; filter && (error == 0);
2384                          filter = filter->sfe_next_onsocket) {
2385                         if (filter->sfe_filter->sf_filter.sf_getoption) {
2386                                 if (filtered == 0) {
2387                                         filtered = 1;
2388                                         sflt_use(so);
2389                                         socket_unlock(so, 0);
2390                                 }
2391                                 error = filter->sfe_filter->sf_filter.sf_getoption(
2392                                                         filter->sfe_cookie, so, sopt);
2393                         }
2394                 }
2395                 if (filtered != 0) {
2396                         socket_lock(so, 0);
2397                         sflt_unuse(so);
2398
2399                         if (error) {
2400                                 if (error == EJUSTRETURN)
2401                                         error = 0;
2402                                 socket_unlock(so, 1);
2403                                 return error;
2404                         }
2405                 }
2406         }
2407
2408         error = 0;
2409         if (sopt->sopt_level != SOL_SOCKET) {
2410                 if (so->so_proto && so->so_proto->pr_ctloutput) {
2411                         error = (*so->so_proto->pr_ctloutput)
2412                                   (so, sopt);
2413                         socket_unlock(so, 1);
2414                         return (error);
2415                 } else {
2416                         socket_unlock(so, 1);
2417                         return (ENOPROTOOPT);
2418                 }
2419         } else {
2420                 switch (sopt->sopt_name) {
2421                 case SO_LINGER:
2422                 case SO_LINGER_SEC:
2423                         l.l_onoff = so->so_options & SO_LINGER;
2424                         l.l_linger = (sopt->sopt_name == SO_LINGER) ? so->so_linger :
2425                                 so->so_linger / hz;
2426                         error = sooptcopyout(sopt, &l, sizeof l);
2427                         break;
2428
2429                 case SO_USELOOPBACK:
2430                 case SO_DONTROUTE:
2431                 case SO_DEBUG:
2432                 case SO_KEEPALIVE:
2433                 case SO_REUSEADDR:
2434                 case SO_REUSEPORT:
2435                 case SO_BROADCAST:
2436                 case SO_OOBINLINE:
2437                 case SO_TIMESTAMP:
2438 #ifdef __APPLE__
2439                 case SO_DONTTRUNC:
2440                 case SO_WANTMORE:
2441                 case SO_WANTOOBFLAG:
2442 #endif
2443                         optval = so->so_options & sopt->sopt_name;
2444 integer:
2445                         error = sooptcopyout(sopt, &optval, sizeof optval);
2446                         break;
2447
2448                 case SO_TYPE:
2449                         optval = so->so_type;
2450                         goto integer;
2451
2452 #ifdef __APPLE__
2453                 case SO_NREAD:
2454                 {
2455                         int pkt_total;
2456                         struct mbuf *m1;
2457
2458                         pkt_total = 0;
2459                         m1 = so->so_rcv.sb_mb;
2460                         if (so->so_proto->pr_flags & PR_ATOMIC)
2461                         {
2462                                 while (m1) {
2463                                         if (m1->m_type == MT_DATA)
2464                                                 pkt_total += m1->m_len;
2465                                         m1 = m1->m_next;
2466                                 }
2467                                 optval = pkt_total;
2468                         } else
2469                                 optval = so->so_rcv.sb_cc;
2470                         goto integer;
2471                 }
2472                 case SO_NWRITE:
2473                         optval = so->so_snd.sb_cc;
2474                         goto integer;
2475 #endif
2476                 case SO_ERROR:
2477                         optval = so->so_error;
2478                         so->so_error = 0;
2479                         goto integer;
2480
2481                 case SO_SNDBUF:
2482                         optval = so->so_snd.sb_hiwat;
2483                         goto integer;
2484
2485                 case SO_RCVBUF:
2486                         optval = so->so_rcv.sb_hiwat;
2487                         goto integer;
2488
2489                 case SO_SNDLOWAT:
2490                         optval = so->so_snd.sb_lowat;
2491                         goto integer;
2492
2493                 case SO_RCVLOWAT:
2494                         optval = so->so_rcv.sb_lowat;
2495                         goto integer;
2496
2497                 case SO_SNDTIMEO:
2498                 case SO_RCVTIMEO:
2499                         tv = (sopt->sopt_name == SO_SNDTIMEO ?
2500                                   so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2501
2502                         error = sooptcopyout(sopt, &tv, sizeof tv);
2503                         break;
2504
2505                 case SO_NOSIGPIPE:
2506                         optval = (so->so_flags & SOF_NOSIGPIPE);
2507                         goto integer;
2508
2509                 case SO_NOADDRERR:
2510                         optval = (so->so_flags & SOF_NOADDRAVAIL);
2511                         goto integer;
2512
2513                 default:
2514                         error = ENOPROTOOPT;
2515                         break;
2516                 }
2517                 socket_unlock(so, 1);
2518                 return (error);
2519         }
2520 }
2521
2522 /* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
2523 int
2524 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2525 {
2526         struct mbuf *m, *m_prev;
2527         int sopt_size = sopt->sopt_valsize;
2528
2529         if (sopt_size > MAX_SOOPTGETM_SIZE)
2530                 return EMSGSIZE;
2531
2532         MGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT, MT_DATA);
2533         if (m == 0)
2534                 return ENOBUFS;
2535         if (sopt_size > MLEN) {
2536                 MCLGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT);
2537                 if ((m->m_flags & M_EXT) == 0) {
2538                         m_free(m);
2539                         return ENOBUFS;
2540                 }
2541                 m->m_len = min(MCLBYTES, sopt_size);
2542         } else {
2543                 m->m_len = min(MLEN, sopt_size);
2544         }
2545         sopt_size -= m->m_len;
2546         *mp = m;
2547         m_prev = m;
2548
2549         while (sopt_size) {
2550                 MGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT, MT_DATA);
2551                 if (m == 0) {
2552                         m_freem(*mp);
2553                         return ENOBUFS;
2554                 }
2555                 if (sopt_size > MLEN) {
2556                         MCLGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT);
2557                         if ((m->m_flags & M_EXT) == 0) {
2558                                 m_freem(*mp);
2559                                 return ENOBUFS;
2560                         }
2561                         m->m_len = min(MCLBYTES, sopt_size);
2562                 } else {
2563                         m->m_len = min(MLEN, sopt_size);
2564                 }
2565                 sopt_size -= m->m_len;
2566                 m_prev->m_next = m;
2567                 m_prev = m;
2568         }
2569         return 0;
2570 }
2571
2572 /* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
2573 int
2574 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2575 {
2576         struct mbuf *m0 = m;
2577
2578         if (sopt->sopt_val == USER_ADDR_NULL)
2579                 return 0;
2580         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2581                 if (sopt->sopt_p != NULL) {
2582                         int error;
2583
2584                         error = copyin(sopt->sopt_val, mtod(m, char *), m->m_len);
2585                         if (error != 0) {
2586                                 m_freem(m0);
2587                                 return(error);
2588                         }
2589                 } else
2590                         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), mtod(m, char *), m->m_len);
2591                 sopt->sopt_valsize -= m->m_len;
2592                 sopt->sopt_val += m->m_len;
2593                 m = m->m_next;
2594         }
2595         if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */
2596                 panic("soopt_mcopyin");
2597         return 0;
2598 }
2599
2600 /* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2601 int
2602 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2603 {
2604         struct mbuf *m0 = m;
2605         size_t valsize = 0;
2606
2607         if (sopt->sopt_val == USER_ADDR_NULL)
2608                 return 0;
2609         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2610                 if (sopt->sopt_p != NULL) {
2611                         int error;
2612
2613                         error = copyout(mtod(m, char *), sopt->sopt_val, m->m_len);
2614                         if (error != 0) {
2615                                 m_freem(m0);
2616                                 return(error);
2617                         }
2618                 } else
2619                         bcopy(mtod(m, char *), CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
2620                sopt->sopt_valsize -= m->m_len;
2621                sopt->sopt_val += m->m_len;
2622                valsize += m->m_len;
2623                m = m->m_next;
2624         }
2625         if (m != NULL) {
2626                 /* enough soopt buffer should be given from user-land */
2627                 m_freem(m0);
2628                 return(EINVAL);
2629         }
2630         sopt->sopt_valsize = valsize;
2631         return 0;
2632 }
2633
2634 void
2635 sohasoutofband(so)
2636         register struct socket *so;
2637 {
2638         struct proc *p;
2639
2640         if (so->so_pgid < 0)
2641                 gsignal(-so->so_pgid, SIGURG);
2642         else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
2643                 psignal(p, SIGURG);
2644         selwakeup(&so->so_rcv.sb_sel);
2645 }
2646
2647 int
2648 sopoll(struct socket *so, int events, __unused kauth_cred_t cred, void * wql)
2649 {
2650         struct proc *p = current_proc();
2651         int revents = 0;
2652
2653         socket_lock(so, 1);
2654
2655         if (events & (POLLIN | POLLRDNORM))
2656                 if (soreadable(so))
2657                         revents |= events & (POLLIN | POLLRDNORM);
2658
2659         if (events & (POLLOUT | POLLWRNORM))
2660                 if (sowriteable(so))
2661                         revents |= events & (POLLOUT | POLLWRNORM);
2662
2663         if (events & (POLLPRI | POLLRDBAND))
2664                 if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
2665                         revents |= events & (POLLPRI | POLLRDBAND);
2666
2667         if (revents == 0) {
2668                 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
2669                         /* Darwin sets the flag first, BSD calls selrecord first */
2670                         so->so_rcv.sb_flags |= SB_SEL;
2671                         selrecord(p, &so->so_rcv.sb_sel, wql);
2672                 }
2673
2674                 if (events & (POLLOUT | POLLWRNORM)) {
2675                         /* Darwin sets the flag first, BSD calls selrecord first */
2676                         so->so_snd.sb_flags |= SB_SEL;
2677                         selrecord(p, &so->so_snd.sb_sel, wql);
2678                 }
2679         }
2680
2681         socket_unlock(so, 1);
2682         return (revents);
2683 }
2684
2685 int     soo_kqfilter(struct fileproc *fp, struct knote *kn, struct proc *p);
2686
2687 int
2688 soo_kqfilter(__unused struct fileproc *fp, struct knote *kn, __unused struct proc *p)
2689 {
2690         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
2691         struct sockbuf *sb;
2692         socket_lock(so, 1);
2693
2694         switch (kn->kn_filter) {
2695         case EVFILT_READ:
2696                 if (so->so_options & SO_ACCEPTCONN)
2697                         kn->kn_fop = &solisten_filtops;
2698                 else
2699                         kn->kn_fop = &soread_filtops;
2700                 sb = &so->so_rcv;
2701                 break;
2702         case EVFILT_WRITE:
2703                 kn->kn_fop = &sowrite_filtops;
2704                 sb = &so->so_snd;
2705                 break;
2706         default:
2707                 socket_unlock(so, 1);
2708                 return (1);
2709         }
2710
2711         if (KNOTE_ATTACH(&sb->sb_sel.si_note, kn))
2712                 sb->sb_flags |= SB_KNOTE;
2713         socket_unlock(so, 1);
2714         return (0);
2715 }
2716
2717 static void
2718 filt_sordetach(struct knote *kn)
2719 {
2720         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
2721
2722         socket_lock(so, 1);
2723         if (so->so_rcv.sb_flags & SB_KNOTE)
2724                 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
2725                         so->so_rcv.sb_flags &= ~SB_KNOTE;
2726         socket_unlock(so, 1);
2727 }
2728
2729 /*ARGSUSED*/
2730 static int
2731 filt_soread(struct knote *kn, long hint)
2732 {
2733         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
2734
2735         if ((hint & SO_FILT_HINT_LOCKED) == 0)
2736                 socket_lock(so, 1);
2737
2738         if (so->so_oobmark) {
2739                 if (kn->kn_flags & EV_OOBAND) {
2740                         kn->kn_data = so->so_rcv.sb_cc - so->so_oobmark;
2741                         if ((hint & SO_FILT_HINT_LOCKED) == 0)
2742                                 socket_unlock(so, 1);
2743                         return (1);
2744                 }
2745                 kn->kn_data = so->so_oobmark;
2746                 kn->kn_flags |= EV_OOBAND;
2747         } else {
2748                 kn->kn_data = so->so_rcv.sb_cc;
2749                 if (so->so_state & SS_CANTRCVMORE) {
2750                         kn->kn_flags |= EV_EOF;
2751                         kn->kn_fflags = so->so_error;
2752                         if ((hint & SO_FILT_HINT_LOCKED) == 0)
2753                                 socket_unlock(so, 1);
2754                         return (1);
2755                 }
2756         }
2757
2758         if (so->so_state & SS_RCVATMARK) {
2759                 if (kn->kn_flags & EV_OOBAND) {
2760                         if ((hint & SO_FILT_HINT_LOCKED) == 0)
2761                                 socket_unlock(so, 1);
2762                         return (1);
2763                 }
2764                 kn->kn_flags |= EV_OOBAND;
2765         } else if (kn->kn_flags & EV_OOBAND) {
2766                 kn->kn_data = 0;
2767                 if ((hint & SO_FILT_HINT_LOCKED) == 0)
2768                         socket_unlock(so, 1);
2769                 return (0);
2770         }
2771
2772         if (so->so_error) {     /* temporary udp error */
2773                 if ((hint & SO_FILT_HINT_LOCKED) == 0)
2774                         socket_unlock(so, 1);
2775                 return (1);
2776         }
2777
2778         if ((hint & SO_FILT_HINT_LOCKED) == 0)
2779                 socket_unlock(so, 1);
2780
2781         return( kn->kn_flags & EV_OOBAND ||
2782                 kn->kn_data >= ((kn->kn_sfflags & NOTE_LOWAT) ?
2783                                 kn->kn_sdata : so->so_rcv.sb_lowat));
2784 }
2785
2786 static void
2787 filt_sowdetach(struct knote *kn)
2788 {
2789         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
2790         socket_lock(so, 1);
2791
2792         if(so->so_snd.sb_flags & SB_KNOTE)
2793                 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
2794                         so->so_snd.sb_flags &= ~SB_KNOTE;
2795         socket_unlock(so, 1);
2796 }
2797
2798 /*ARGSUSED*/
2799 static int
2800 filt_sowrite(struct knote *kn, long hint)
2801 {
2802         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
2803
2804         if ((hint & SO_FILT_HINT_LOCKED) == 0)
2805                 socket_lock(so, 1);
2806
2807         kn->kn_data = sbspace(&so->so_snd);
2808         if (so->so_state & SS_CANTSENDMORE) {
2809                 kn->kn_flags |= EV_EOF;
2810                 kn->kn_fflags = so->so_error;
2811                 if ((hint & SO_FILT_HINT_LOCKED) == 0)
2812                         socket_unlock(so, 1);
2813                 return (1);
2814         }
2815         if (so->so_error) {     /* temporary udp error */
2816                 if ((hint & SO_FILT_HINT_LOCKED) == 0)
2817                         socket_unlock(so, 1);
2818                 return (1);
2819         }
2820         if (((so->so_state & SS_ISCONNECTED) == 0) &&
2821             (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
2822                 if ((hint & SO_FILT_HINT_LOCKED) == 0)
2823                         socket_unlock(so, 1);
2824                 return (0);
2825         }
2826         if ((hint & SO_FILT_HINT_LOCKED) == 0)
2827                 socket_unlock(so, 1);
2828         if (kn->kn_sfflags & NOTE_LOWAT)
2829                 return (kn->kn_data >= kn->kn_sdata);
2830         return (kn->kn_data >= so->so_snd.sb_lowat);
2831 }
2832
2833 /*ARGSUSED*/
2834 static int
2835 filt_solisten(struct knote *kn, long hint)
2836 {
2837         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
2838         int isempty;
2839
2840         if ((hint & SO_FILT_HINT_LOCKED) == 0)
2841                 socket_lock(so, 1);
2842         kn->kn_data = so->so_qlen;
2843         isempty = ! TAILQ_EMPTY(&so->so_comp);
2844         if ((hint & SO_FILT_HINT_LOCKED) == 0)
2845                 socket_unlock(so, 1);
2846         return (isempty);
2847 }
2848
2849
2850 int
2851 socket_lock(so, refcount)
2852         struct socket *so;
2853         int refcount;
2854 {
2855         int error = 0, lr, lr_saved;
2856 #ifdef __ppc__
2857         __asm__ volatile("mflr %0" : "=r" (lr));
2858         lr_saved = lr;
2859 #endif
2860
2861         if (so->so_proto->pr_lock) {
2862                 error = (*so->so_proto->pr_lock)(so, refcount, lr_saved);
2863         }
2864         else {
2865 #ifdef MORE_LOCKING_DEBUG
2866                 lck_mtx_assert(so->so_proto->pr_domain->dom_mtx, LCK_MTX_ASSERT_NOTOWNED);
2867 #endif
2868                 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
2869                 if (refcount)
2870                         so->so_usecount++;
2871                 so->reserved3 = (void*)lr_saved; /* save caller for refcount going to zero */
2872         }
2873
2874         return(error);
2875
2876 }
2877
2878 int
2879 socket_unlock(so, refcount)
2880         struct socket *so;
2881         int refcount;
2882 {
2883         int error = 0, lr, lr_saved;
2884         lck_mtx_t * mutex_held;
2885
2886 #ifdef __ppc__
2887 __asm__ volatile("mflr %0" : "=r" (lr));
2888         lr_saved = lr;
2889 #endif
2890
2891
2892
2893         if (so->so_proto == NULL)
2894                 panic("socket_unlock null so_proto so=%x\n", so);
2895
2896         if (so && so->so_proto->pr_unlock)
2897                 error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
2898         else {
2899                 mutex_held = so->so_proto->pr_domain->dom_mtx;
2900 #ifdef MORE_LOCKING_DEBUG
2901                 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
2902 #endif
2903                 if (refcount) {
2904                         if (so->so_usecount <= 0)
2905                                 panic("socket_unlock: bad refcount so=%x value=%d\n", so, so->so_usecount);
2906                         so->so_usecount--;
2907                         if (so->so_usecount == 0) {
2908                                 sofreelastref(so, 1);
2909                         }
2910                         else
2911                                 so->reserved4 = (void*)lr_saved; /* save caller */
2912                 }
2913                 lck_mtx_unlock(mutex_held);
2914         }
2915
2916         return(error);
2917 }
2918 //### Called with socket locked, will unlock socket
2919 void
2920 sofree(so)
2921         struct socket *so;
2922 {
2923
2924         int lr, lr_saved;
2925         lck_mtx_t * mutex_held;
2926 #ifdef __ppc__
2927         __asm__ volatile("mflr %0" : "=r" (lr));
2928         lr_saved = lr;
2929 #endif
2930         if (so->so_proto->pr_getlock != NULL)
2931                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
2932         else
2933                 mutex_held = so->so_proto->pr_domain->dom_mtx;
2934         lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
2935
2936         sofreelastref(so, 0);
2937 }
2938
/*
 * Take one additional reference on a socket.  The lock is held only for
 * as long as it takes to bump the count: lock with a reference, then
 * unlock without giving it back.
 */
void
soreference(so)
        struct socket *so;
{
        (void) socket_lock(so, 1);      /* lock and acquire a reference */
        (void) socket_unlock(so, 0);    /* release the lock, keep the reference */
}
2946
/*
 * Give back one reference on a socket: lock without taking a reference,
 * then unlock and release one.
 */
void
sodereference(so)
        struct socket *so;
{
        (void) socket_lock(so, 0);      /* lock only */
        (void) socket_unlock(so, 1);    /* unlock and drop a reference */
}