/*
 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1998, 1999 Apple Computer, Inc. All Rights Reserved */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
 * $FreeBSD: src/sys/kern/uipc_socket2.c,v 1.55.2.9 2001/07/26 18:53:02 peter Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/ev.h>
#include <kern/locks.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <sys/kdebug.h>

#define DBG_FNC_SBDROP		NETDBG_CODE(DBG_NETSOCK, 4)
#define DBG_FNC_SBAPPEND	NETDBG_CODE(DBG_NETSOCK, 5)

static int sbcompress(struct sockbuf *, struct mbuf *, struct mbuf *);

/*
 * Primitive routines for operating on sockets and socket buffers
 */

u_long	sb_max = SB_MAX;		/* XXX should be static */

static	u_long sb_efficiency = 8;	/* parameter for sbreserve() */
/*
 * Procedures to manipulate state flags of socket
 * and do appropriate wakeups.  Normal sequence from the
 * active (originating) side is that soisconnecting() is
 * called during processing of connect() call,
 * resulting in an eventual call to soisconnected() if/when the
 * connection is established.  When the connection is torn down
 * soisdisconnecting() is called during processing of disconnect() call,
 * and soisdisconnected() is called when the connection to the peer
 * is totally severed.  The semantics of these routines are such that
 * connectionless protocols can call soisconnected() and soisdisconnected()
 * only, bypassing the in-progress calls when setting up a ``connection''
 * takes no time.
 *
 * From the passive side, a socket is created with
 * two queues of sockets: so_incomp for connections in progress
 * and so_comp for connections already made and awaiting user acceptance.
 * As a protocol is preparing incoming connections, it creates a socket
 * structure queued on so_incomp by calling sonewconn().  When the connection
 * is established, soisconnected() is called, and transfers the
 * socket structure to so_comp, making it available to accept().
 *
 * If a socket is closed with sockets on either
 * so_incomp or so_comp, these sockets are dropped.
 *
 * If higher level protocols are implemented in
 * the kernel, the wakeups done here will sometimes
 * cause software-interrupt process scheduling.
 */
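/*
 * Illustrative sketch (not part of the original flow): how a
 * connection-oriented protocol typically drives the transitions described
 * above.  The function name example_proto_event() and the event numbering
 * are hypothetical; only soisconnecting(), soisconnected(),
 * soisdisconnecting() and soisdisconnected() are real entry points here.
 */
#if 0
static void
example_proto_event(struct socket *so, int event)
{
	switch (event) {
	case 0:		/* user called connect(); handshake started */
		soisconnecting(so);
		break;
	case 1:		/* handshake completed; wakes connect()/accept() waiters */
		soisconnected(so);
		break;
	case 2:		/* local close in progress */
		soisdisconnecting(so);
		break;
	case 3:		/* connection fully torn down */
		soisdisconnected(so);
		break;
	}
}
#endif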
void
soisconnecting(so)
	register struct socket *so;
{

	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;

	sflt_notify(so, sock_evt_connecting, NULL);
}
void
soisconnected(so)
	register struct socket *so;
{
	struct socket *head = so->so_head;

	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
	so->so_state |= SS_ISCONNECTED;

	sflt_notify(so, sock_evt_connected, NULL);

	if (head && (so->so_state & SS_INCOMP)) {
		so->so_state &= ~SS_INCOMP;
		so->so_state |= SS_COMP;
		if (head->so_proto->pr_getlock != NULL) {
			socket_unlock(so, 0);
			socket_lock(head, 1);
		}
		postevent(head, 0, EV_RCONN);
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		head->so_incqlen--;
		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
		sorwakeup(head);
		wakeup_one((caddr_t)&head->so_timeo);
		if (head->so_proto->pr_getlock != NULL) {
			socket_unlock(head, 1);
			socket_lock(so, 0);
		}
	} else {
		postevent(so, 0, EV_WCONN);
		wakeup((caddr_t)&so->so_timeo);
		sorwakeup(so);
		sowwakeup(so);
	}
}
void
soisdisconnecting(so)
	register struct socket *so;
{
	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
	sflt_notify(so, sock_evt_disconnecting, NULL);
	wakeup((caddr_t)&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}
void
soisdisconnected(so)
	register struct socket *so;
{
	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
	sflt_notify(so, sock_evt_disconnected, NULL);
	wakeup((caddr_t)&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}
/*
 * Return a random connection that hasn't been serviced yet and
 * is eligible for discard.  There is a one in qlen chance that
 * we will return a null, saying that there are no droppable
 * requests.  In this case, the protocol specific code should drop
 * the new request.  This ensures fairness.
 *
 * This may be used in conjunction with protocol specific queue
 * congestion routines.
 */
struct socket *
sodropablereq(head)
	register struct socket *head;
{
	struct socket *so, *sonext = NULL;
	unsigned int i, j, qlen;
	static int rnd;
	static struct timeval old_runtime;
	static unsigned int cur_cnt, old_cnt;
	struct timeval tv;

	microtime(&tv);
	if ((i = (tv.tv_sec - old_runtime.tv_sec)) != 0) {
		old_runtime = tv;
		old_cnt = cur_cnt / i;
		cur_cnt = 0;
	}

	so = TAILQ_FIRST(&head->so_incomp);
	if (!so)
		return (NULL);

	qlen = head->so_incqlen;
	if (++cur_cnt > qlen || old_cnt > qlen) {
		rnd = (314159 * rnd + 66329) & 0xffff;
		j = ((qlen + 1) * rnd) >> 16;

		while (j-- && so) {
//			if (in_pcb_checkstate(so->so_pcb, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
				socket_lock(so, 1);
				sonext = TAILQ_NEXT(so, so_list);
//				in_pcb_check_state(so->so_pcb, WNT_RELEASE, 0);
				socket_unlock(so, 1);
				so = sonext;
//			}
		}
	}

//	if (in_pcb_checkstate(so->so_pcb, WNT_ACQUIRE, 0) == WNT_STOPUSING)
//		return (NULL);
//	else
		return (so);
}
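/*
 * Worked example of the index computation above (illustrative only): rnd
 * is advanced with a 16-bit linear congruential step and then scaled into
 * the range [0, qlen].  With qlen = 9 and rnd = 0x8000, the candidate index
 * is ((9 + 1) * 0x8000) >> 16 = 5; an index that walks past the last entry
 * yields NULL, which is the "no droppable request" case described above.
 * The helper name below is hypothetical.
 */
#if 0
static unsigned int
example_drop_index(unsigned int qlen, int *rndp)
{
	/* same recurrence as sodropablereq(): 16-bit LCG, then scale */
	*rndp = (314159 * (*rndp) + 66329) & 0xffff;
	return (((qlen + 1) * (unsigned int)*rndp) >> 16);	/* in [0, qlen] */
}
#endif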
/*
 * When an attempt at a new connection is noted on a socket
 * which accepts connections, sonewconn is called.  If the
 * connection is possible (subject to space constraints, etc.)
 * then we allocate a new structure, properly linked into the
 * data structure of the original socket, and return this.
 * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED.
 */
static struct socket *
sonewconn_internal(head, connstatus)
	register struct socket *head;
	int connstatus;
{
	int error = 0;
	register struct socket *so;
	lck_mtx_t *mutex_held;

	if (head->so_proto->pr_getlock != NULL)
		mutex_held = (*head->so_proto->pr_getlock)(head, 0);
	else
		mutex_held = head->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (head->so_qlen > 3 * head->so_qlimit / 2)
		return ((struct socket *)0);
	so = soalloc(1, head->so_proto->pr_domain->dom_family, head->so_type);
	if (so == NULL)
		return ((struct socket *)0);
	/* check if head was closed during the soalloc */
	if (head->so_proto == NULL) {
		sodealloc(so);
		return ((struct socket *)0);
	}

	so->so_head = head;
	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_proto = head->so_proto;
	so->so_timeo = head->so_timeo;
	so->so_pgid = head->so_pgid;
	so->so_uid = head->so_uid;
	so->so_usecount = 1;
	so->next_lock_lr = 0;
	so->next_unlock_lr = 0;

	so->so_rcv.sb_flags |= SB_RECV;	/* XXX */
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	TAILQ_INIT(&so->so_evlist);

	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
		sodealloc(so);
		return ((struct socket *)0);
	}

	/*
	 * Must be done with head unlocked to avoid deadlock for protocol with per socket mutexes.
	 */
	if (head->so_proto->pr_unlock)
		socket_unlock(head, 0);
	if (((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL) != 0) || error) {
		sodealloc(so);
		if (head->so_proto->pr_unlock)
			socket_lock(head, 0);
		return ((struct socket *)0);
	}
	if (head->so_proto->pr_unlock)
		socket_lock(head, 0);
	so->so_proto->pr_domain->dom_refs++;

	if (connstatus) {
		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
		so->so_state |= SS_COMP;
	} else {
		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
		so->so_state |= SS_INCOMP;
		head->so_incqlen++;
	}
	head->so_qlen++;

	/* Attach socket filters for this protocol */
	sflt_initsock(so);

	if (connstatus) {
		so->so_state |= connstatus;
		sorwakeup(head);
		wakeup((caddr_t)&head->so_timeo);
	}
	return (so);
}
struct socket *
sonewconn(
	struct socket *head,
	int connstatus,
	const struct sockaddr *from)
{
	int error = 0;
	struct socket_filter_entry *filter;
	int filtered = 0;

	for (filter = head->so_filt; filter && (error == 0);
	     filter = filter->sfe_next_onsocket) {
		if (filter->sfe_filter->sf_filter.sf_connect_in) {
			if (filtered == 0) {
				filtered = 1;
				socket_unlock(head, 0);
			}
			error = filter->sfe_filter->sf_filter.sf_connect_in(
				filter->sfe_cookie, head, from);
		}
	}
	if (filtered != 0) {
		socket_lock(head, 0);
	}

	if (error) {
		return NULL;
	}

	return sonewconn_internal(head, connstatus);
}
/*
 * Socantsendmore indicates that no more data will be sent on the
 * socket; it would normally be applied to a socket when the user
 * informs the system that no more data is to be sent, by the protocol
 * code (in case PRU_SHUTDOWN).  Socantrcvmore indicates that no more data
 * will be received, and will normally be applied to the socket by a
 * protocol when it detects that the peer will send no more data.
 * Data queued for reading in the socket may yet be read.
 */
void
socantsendmore(so)
	struct socket *so;
{
	so->so_state |= SS_CANTSENDMORE;
	sflt_notify(so, sock_evt_cantsendmore, NULL);
	sowwakeup(so);
}

void
socantrcvmore(so)
	struct socket *so;
{
	so->so_state |= SS_CANTRCVMORE;
	sflt_notify(so, sock_evt_cantrecvmore, NULL);
	sorwakeup(so);
}
/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
int
sbwait(sb)
	struct sockbuf *sb;
{
	int error = 0, lr_saved;
	struct socket *so = sb->sb_so;
	lck_mtx_t *mutex_held;
	struct timespec ts;

	lr_saved = (unsigned int) __builtin_return_address(0);

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;

	sb->sb_flags |= SB_WAIT;

	if (so->so_usecount < 1)
		panic("sbwait: so=%x refcount=%d\n", so, so->so_usecount);
	ts.tv_sec = sb->sb_timeo.tv_sec;
	ts.tv_nsec = sb->sb_timeo.tv_usec * 1000;
	error = msleep((caddr_t)&sb->sb_cc, mutex_held,
	    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait",
	    &ts);

	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (so->so_usecount < 1)
		panic("sbwait: so=%x refcount=%d\n", so, so->so_usecount);

	if ((so->so_state & SS_DRAINING)) {
		error = EBADF;
	}

	return (error);
}
/*
 * Lock a sockbuf already known to be locked;
 * return any error returned from sleep (EINTR).
 */
int
sb_lock(sb)
	register struct sockbuf *sb;
{
	struct socket *so = sb->sb_so;
	lck_mtx_t *mutex_held;
	int error = 0;

	if (so == NULL)
		panic("sb_lock: null so back pointer sb=%x\n", sb);

	while (sb->sb_flags & SB_LOCK) {
		sb->sb_flags |= SB_WANT;
		if (so->so_proto->pr_getlock != NULL)
			mutex_held = (*so->so_proto->pr_getlock)(so, 0);
		else
			mutex_held = so->so_proto->pr_domain->dom_mtx;
		if (so->so_usecount < 1)
			panic("sb_lock: so=%x refcount=%d\n", so, so->so_usecount);

		error = msleep((caddr_t)&sb->sb_flags, mutex_held,
		    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sblock", 0);
		if (so->so_usecount < 1)
			panic("sb_lock: 2 so=%x refcount=%d\n", so, so->so_usecount);
		if (error)
			return (error);
	}
	sb->sb_flags |= SB_LOCK;
	return (0);
}
/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket has the SS_ASYNC flag set.
 */
void
sowakeup(so, sb)
	register struct socket *so;
	register struct sockbuf *sb;
{
	struct proc *p = current_proc();

	sb->sb_flags &= ~SB_SEL;
	selwakeup(&sb->sb_sel);
	if (sb->sb_flags & SB_WAIT) {
		sb->sb_flags &= ~SB_WAIT;
		wakeup((caddr_t)&sb->sb_cc);
	}
	if (so->so_state & SS_ASYNC) {
		if (so->so_pgid < 0)
			gsignal(-so->so_pgid, SIGIO);
		else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
			psignal(p, SIGIO);
	}
	if (sb->sb_flags & SB_KNOTE) {
		KNOTE(&sb->sb_sel.si_note, SO_FILT_HINT_LOCKED);
	}
	if (sb->sb_flags & SB_UPCALL) {
		socket_unlock(so, 0);
		(*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT);
		socket_lock(so, 0);
	}
}
/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing select() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field.  The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbuf's must be of type MT_RIGHTS).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space to the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */
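/*
 * Illustrative sketch of the reservation step described above (the attach
 * routine name and the 8K/16K sizes are hypothetical examples, not values
 * taken from this file): a protocol attach path commits send and receive
 * space with soreserve() before any data can be queued.
 */
#if 0
static int
example_proto_attach(struct socket *so)
{
	int error;

	error = soreserve(so, 8 * 1024, 16 * 1024);
	if (error)
		return (error);		/* ENOBUFS: request exceeded the sb_max scaling */

	/* ... allocate and link the protocol control block here ... */
	return (0);
}
#endif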
int
soreserve(so, sndcc, rcvcc)
	register struct socket *so;
	u_long sndcc, rcvcc;
{

	if (sbreserve(&so->so_snd, sndcc) == 0)
		goto bad;
	if (sbreserve(&so->so_rcv, rcvcc) == 0)
		goto bad2;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = MCLBYTES;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	return (0);
bad2:
	selthreadclear(&so->so_snd.sb_sel);
	sbrelease(&so->so_snd);
bad:
	return (ENOBUFS);
}
/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 */
int
sbreserve(sb, cc)
	struct sockbuf *sb;
	u_long cc;
{
	if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES))
		return (0);
	sb->sb_hiwat = cc;
	sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (1);
}
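/*
 * Worked example of the scaling above (illustrative numbers): with
 * sb_efficiency = 8 and a request of cc = 8192 bytes, sb_mbmax becomes
 * min(8192 * 8, sb_max) = 65536, so roughly 8x the byte limit is allowed
 * in mbuf overhead before mbcnt, rather than cc, becomes the limiting
 * factor.  The request itself is rejected only if cc exceeds
 * sb_max * MCLBYTES / (MSIZE + MCLBYTES).
 */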
/*
 * Free mbufs held by a socket, and reserved mbuf space.
 */
/*  WARNING needs to do selthreadclear() before calling this */
/*
 * Routines to add and remove
 * data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, comparing the function sbspace() with the amount
 * of data to be added.  sbappendrecord() differs from sbappend() in
 * that data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendrights()
 * should be used.  In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copy for output to a peer,
 * and then removing the data from the socket buffer with sbdrop()
 * or sbdroprecord() when the data is acknowledged by the peer.
 */
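/*
 * Illustrative sketch of the datagram receive path described above (the
 * function name is hypothetical): the protocol hands the sender's address,
 * the data chain and optional control mbufs to sbappendaddr(), which builds
 * one new record and fails cleanly if the buffer is full.
 */
#if 0
static void
example_deliver_datagram(struct socket *so, struct sockaddr *from,
	struct mbuf *data, struct mbuf *control)
{
	int error = 0;

	if (sbappendaddr(&so->so_rcv, from, data, control, &error) != 0)
		sorwakeup(so);	/* a record was queued; wake any readers */
	/* on failure the mbufs have already been freed and error is set */
}
#endif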
/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated
 * with the mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
int
sbappend(sb, m)
	struct sockbuf *sb;
	struct mbuf *m;
{
	register struct mbuf *n, *sb_first;
	int result = 0;
	int error = 0;
	int filtered = 0;

	KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_START), sb, m->m_len, 0, 0, 0);

	if (m == 0)
		return 0;

again:
	sb_first = n = sb->sb_mb;
	if (n) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		do {
			if (n->m_flags & M_EOR) {
				result = sbappendrecord(sb, m); /* XXXXXX!!!! */
				KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_END), sb, sb->sb_cc, 0, 0, 0);
				return result;
			}
		} while (n->m_next && (n = n->m_next));
	}

	if (!filtered && (sb->sb_flags & SB_RECV) != 0) {
		error = sflt_data_in(sb->sb_so, NULL, &m, NULL, 0, &filtered);
		if (error) {
			/* no data was appended, caller should not call sowakeup */
			return 0;
		}

		/*
		 * If we ran any filters, the socket lock was dropped.  n and sb_first
		 * cached data from the socket buffer.  This cache is not valid
		 * since we dropped the lock.  We must start over.  Since filtered
		 * is set we won't run through the filters a second time.  We just
		 * set n and sb_first again.
		 */
		if (filtered)
			goto again;
	}

	result = sbcompress(sb, m, n);

	KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_END), sb, sb->sb_cc, 0, 0, 0);

	return result;
}
void
sbcheck(sb)
	register struct sockbuf *sb;
{
	register struct mbuf *m;
	register struct mbuf *n = 0;
	register u_long len = 0, mbcnt = 0;
	lck_mtx_t *mutex_held;

	if (sb->sb_so->so_proto->pr_getlock != NULL)
		mutex_held = (*sb->sb_so->so_proto->pr_getlock)(sb->sb_so, 0);
	else
		mutex_held = sb->sb_so->so_proto->pr_domain->dom_mtx;

	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	for (m = sb->sb_mb; m; m = n) {
		n = m->m_nextpkt;
		for (; m; m = m->m_next) {
			len += m->m_len;
			mbcnt += MSIZE;
			if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
				mbcnt += m->m_ext.ext_size;
		}
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		panic("cc %ld != %ld || mbcnt %ld != %ld\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
	}
}
/*
 * As above, except the mbuf chain
 * begins a new record.
 */
int
sbappendrecord(sb, m0)
	register struct sockbuf *sb;
	register struct mbuf *m0;
{
	register struct mbuf *m;

	if (m0 == 0)
		return 0;

	if ((sb->sb_flags & SB_RECV) != 0) {
		int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL,
		    sock_data_filt_flag_record, NULL);
		if (error != 0) {
			if (error != EJUSTRETURN)
				m_freem(m0);
			return 0;
		}
	}

	m = sb->sb_mb;
	if (m)
		while (m->m_nextpkt)
			m = m->m_nextpkt;
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	if (m)
		m->m_nextpkt = m0;
	else
		sb->sb_mb = m0;
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	return sbcompress(sb, m, m0);
}
/*
 * As above except that OOB data
 * is inserted at the beginning of the sockbuf,
 * but after any other OOB data.
 */
int
sbinsertoob(sb, m0)
	struct sockbuf *sb;
	struct mbuf *m0;
{
	struct mbuf *m;
	struct mbuf **mp;

	if (m0 == 0)
		return 0;

	if ((sb->sb_flags & SB_RECV) != 0) {
		int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL,
		    sock_data_filt_flag_oob, NULL);

		if (error) {
			if (error != EJUSTRETURN) {
				m_freem(m0);
			}
			return 0;
		}
	}

	for (mp = &sb->sb_mb; *mp; mp = &((*mp)->m_nextpkt)) {
		m = *mp;
	    again:
		switch (m->m_type) {

		case MT_OOBDATA:
			continue;		/* WANT next train */

		case MT_CONTROL:
			m = m->m_next;
			if (m)
				goto again;	/* inspect THIS train further */
		}
		break;
	}
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	m0->m_nextpkt = *mp;
	*mp = m0;
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	return sbcompress(sb, m, m0);
}
/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if no space in sockbuf or insufficient mbufs.
 */
static int
sbappendaddr_internal(sb, asa, m0, control)
	register struct sockbuf *sb;
	struct sockaddr *asa;
	struct mbuf *m0, *control;
{
	register struct mbuf *m, *n;
	int space = asa->sa_len;

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddr");

	if (m0)
		space += m0->m_pkthdr.len;
	for (n = control; n; n = n->m_next) {
		space += n->m_len;
		if (n->m_next == 0)	/* keep pointer to last control buf */
			break;
	}
	if (space > sbspace(sb))
		return (0);
	if (asa->sa_len > MLEN)
		return (0);
	MGET(m, M_DONTWAIT, MT_SONAME);
	if (m == 0)
		return (0);
	m->m_len = asa->sa_len;
	bcopy((caddr_t)asa, mtod(m, caddr_t), asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;
	for (n = m; n; n = n->m_next)
		sballoc(sb, n);
	n = sb->sb_mb;
	if (n) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		n->m_nextpkt = m;
	} else
		sb->sb_mb = m;
	postevent(0, sb, EV_RWBYTES);
	return (1);
}
int
sbappendaddr(
	struct sockbuf *sb,
	struct sockaddr *asa,
	struct mbuf *m0,
	struct mbuf *control,
	int *error_out)
{
	int result = 0;

	if (error_out) *error_out = 0;

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddrorfree");

	/* Call socket data in filters */
	if ((sb->sb_flags & SB_RECV) != 0) {
		int error;
		error = sflt_data_in(sb->sb_so, asa, &m0, &control, 0, NULL);
		if (error) {
			if (error != EJUSTRETURN) {
				if (m0) m_freem(m0);
				if (control) m_freem(control);
				if (error_out) *error_out = error;
			}
			return 0;
		}
	}

	result = sbappendaddr_internal(sb, asa, m0, control);
	if (result == 0) {
		if (m0) m_freem(m0);
		if (control) m_freem(control);
		if (error_out) *error_out = ENOBUFS;
	}

	return result;
}
static int
sbappendcontrol_internal(sb, m0, control)
	struct sockbuf *sb;
	struct mbuf *control, *m0;
{
	register struct mbuf *m, *n;
	int space = 0;

	if (control == 0)
		panic("sbappendcontrol");

	for (m = control; ; m = m->m_next) {
		space += m->m_len;
		if (m->m_next == 0)
			break;
	}
	n = m;			/* save pointer to last control buffer */
	for (m = m0; m; m = m->m_next)
		space += m->m_len;
	if (space > sbspace(sb))
		return (0);
	n->m_next = m0;		/* concatenate data to control */
	for (m = control; m; m = m->m_next)
		sballoc(sb, m);
	n = sb->sb_mb;
	if (n) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		n->m_nextpkt = control;
	} else
		sb->sb_mb = control;
	postevent(0, sb, EV_RWBYTES);
	return (1);
}
int
sbappendcontrol(
	struct sockbuf *sb,
	struct mbuf *m0,
	struct mbuf *control,
	int *error_out)
{
	int result = 0;

	if (error_out) *error_out = 0;

	if (sb->sb_flags & SB_RECV) {
		int error;
		error = sflt_data_in(sb->sb_so, NULL, &m0, &control, 0, NULL);
		if (error) {
			if (error != EJUSTRETURN) {
				if (m0) m_freem(m0);
				if (control) m_freem(control);
				if (error_out) *error_out = error;
			}
			return 0;
		}
	}

	result = sbappendcontrol_internal(sb, m0, control);
	if (result == 0) {
		if (m0) m_freem(m0);
		if (control) m_freem(control);
		if (error_out) *error_out = ENOBUFS;
	}

	return result;
}
/*
 * Compress mbuf chain m into the socket
 * buffer sb following mbuf n.  If n
 * is null, the buffer is presumed empty.
 */
static int
sbcompress(sb, m, n)
	register struct sockbuf *sb;
	register struct mbuf *m, *n;
{
	register int eor = 0;
	register struct mbuf *o;

	while (m) {
		eor |= m->m_flags & M_EOR;
		if (m->m_len == 0 &&
		    (eor == 0 ||
		     (((o = m->m_next) || (o = n)) &&
		      o->m_type == m->m_type))) {
			m = m_free(m);
			continue;
		}
		if (n && (n->m_flags & M_EOR) == 0 &&
		    m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
		    m->m_len <= M_TRAILINGSPACE(n) &&
		    n->m_type == m->m_type) {
			bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
			    (unsigned)m->m_len);
			n->m_len += m->m_len;
			sb->sb_cc += m->m_len;
			m = m_free(m);
			continue;
		}
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sballoc(sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = 0;
	}
	if (eor) {
		if (n)
			n->m_flags |= eor;
		else
			printf("semi-panic: sbcompress\n");
	}
	postevent(0, sb, EV_RWBYTES);
	return 1;
}
/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
void
sbflush(sb)
	register struct sockbuf *sb;
{
	if (sb->sb_so == NULL)
		panic("sbflush sb->sb_so already null sb=%x\n", sb);
	(void)sblock(sb, M_WAIT);
	while (sb->sb_mbcnt) {
		/*
		 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
		 * we would loop forever. Panic instead.
		 */
		if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len))
			break;
		sbdrop(sb, (int)sb->sb_cc);
	}
	if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt || sb->sb_so == NULL)
		panic("sbflush: cc %ld || mb %p || mbcnt %ld sb_so=%x", sb->sb_cc,
		    (void *)sb->sb_mb, sb->sb_mbcnt, sb->sb_so);

	postevent(0, sb, EV_RWBYTES);
	sbunlock(sb, 1);	/* keep socket locked */
}
/*
 * Drop data from (the front of) a sockbuf.
 * use m_freem_list to free the mbuf structures
 * under a single lock... this is done by pruning
 * the top of the tree from the body by keeping track
 * of where we get to in the tree and then zeroing the
 * two pertinent pointers m_nextpkt and m_next
 * the socket buffer is then updated to point at the new
 * top of the tree and the pruned area is released via
 * m_freem_list.
 */
void
sbdrop(sb, len)
	register struct sockbuf *sb;
	register int len;
{
	register struct mbuf *m, *free_list, *ml;
	struct mbuf *next, *last;

	KERNEL_DEBUG((DBG_FNC_SBDROP | DBG_FUNC_START), sb, len, 0, 0, 0);

	next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
	free_list = last = m;
	ml = (struct mbuf *)0;

	while (len > 0) {
		if (m == 0) {
			if (next == 0) {
				/* temporarily replacing this panic with printf because
				 * it occurs occasionally when closing a socket when there
				 * is no harm in ignoring it. This problem will be investigated
				 * further.
				 */
				/* panic("sbdrop"); */
				printf("sbdrop - count not zero\n");
				len = 0;
				/* zero the counts. if we have no mbufs, we have no data (PR-2986815) */
				sb->sb_cc = 0;
				sb->sb_mbcnt = 0;
				break;
			}
			m = last = next;
			next = m->m_nextpkt;
			continue;
		}
		if (m->m_len > len) {
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			break;
		}
		len -= m->m_len;
		sbfree(sb, m);

		ml = m;
		m = m->m_next;
	}
	while (m && m->m_len == 0) {
		sbfree(sb, m);

		ml = m;
		m = m->m_next;
	}
	if (ml) {
		ml->m_next = (struct mbuf *)0;
		last->m_nextpkt = (struct mbuf *)0;
		m_freem_list(free_list);
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;

	postevent(0, sb, EV_RWBYTES);

	KERNEL_DEBUG((DBG_FNC_SBDROP | DBG_FUNC_END), sb, 0, 0, 0, 0);
}
/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
void
sbdroprecord(sb)
	register struct sockbuf *sb;
{
	register struct mbuf *m, *mn;

	m = sb->sb_mb;
	if (m) {
		sb->sb_mb = m->m_nextpkt;
		do {
			sbfree(sb, m);
			MFREE(m, mn);
			m = mn;
		} while (m);
	}
	postevent(0, sb, EV_RWBYTES);
}
/*
 * Create a "control" mbuf containing the specified data
 * with the specified type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol(p, size, type, level)
	caddr_t p;
	register int size;
	int type, level;
{
	register struct cmsghdr *cp;
	struct mbuf *m;

	if (CMSG_SPACE((u_int)size) > MLEN)
		return ((struct mbuf *) NULL);
	if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
		return ((struct mbuf *) NULL);
	cp = mtod(m, struct cmsghdr *);
	/* XXX check size? */
	(void)memcpy(CMSG_DATA(cp), p, size);
	m->m_len = CMSG_SPACE(size);
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	return (m);
}
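/*
 * Illustrative sketch: building a control record the way a protocol would
 * before queueing it with sbappendaddr()/sbappendcontrol().  IP_RECVTTL and
 * IPPROTO_IP are used only as a familiar example; the helper name is
 * hypothetical.
 */
#if 0
static struct mbuf *
example_make_ttl_control(u_char ttl)
{
	/* one cmsghdr carrying a single byte of ancillary data */
	return sbcreatecontrol((caddr_t)&ttl, sizeof(ttl), IP_RECVTTL, IPPROTO_IP);
}
#endif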
/*
 * Some routines that return EOPNOTSUPP for entry points that are not
 * supported by a protocol.  Fill in as needed.
 */
int
pru_abort_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
{
	return EOPNOTSUPP;
}

int
pru_attach_notsupp(struct socket *so, int proto, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_connect2_notsupp(struct socket *so1, struct socket *so2)
{
	return EOPNOTSUPP;
}

int
pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
	struct ifnet *ifp, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_detach_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_disconnect_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_listen_notsupp(struct socket *so, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
{
	return EOPNOTSUPP;
}

int
pru_rcvd_notsupp(struct socket *so, int flags)
{
	return EOPNOTSUPP;
}

int
pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
{
	return EOPNOTSUPP;
}

int
pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
	struct sockaddr *addr, struct mbuf *control,
	struct proc *p)
{
	return EOPNOTSUPP;
}

/*
 * This isn't really a ``null'' operation, but it's the default one
 * and doesn't do anything destructive.
 */
int
pru_sense_null(struct socket *so, struct stat *sb)
{
	sb->st_blksize = so->so_snd.sb_hiwat;
	return 0;
}

int pru_sosend_notsupp(struct socket *so, struct sockaddr *addr,
	struct uio *uio, struct mbuf *top,
	struct mbuf *control, int flags)
{
	return EOPNOTSUPP;
}

int pru_soreceive_notsupp(struct socket *so,
	struct sockaddr **paddr,
	struct uio *uio, struct mbuf **mp0,
	struct mbuf **controlp, int *flagsp)
{
	return EOPNOTSUPP;
}

int
pru_shutdown_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
{
	return EOPNOTSUPP;
}

int pru_sosend(struct socket *so, struct sockaddr *addr,
	struct uio *uio, struct mbuf *top,
	struct mbuf *control, int flags)
{
	return EOPNOTSUPP;
}

int pru_soreceive(struct socket *so,
	struct sockaddr **paddr,
	struct uio *uio, struct mbuf **mp0,
	struct mbuf **controlp, int *flagsp)
{
	return EOPNOTSUPP;
}

int
pru_sopoll_notsupp(__unused struct socket *so, __unused int events,
	__unused kauth_cred_t cred, __unused void *wql)
{
	return EOPNOTSUPP;
}
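/*
 * Illustrative sketch (hypothetical function name): a minimal protocol can
 * point unneeded pr_usrreqs slots at the *_notsupp stubs above so that
 * unsupported syscalls fail cleanly with EOPNOTSUPP.  Only a handful of
 * representative slots are shown; consult sys/protosw.h for the full set.
 */
#if 0
static void
example_fill_usrreqs(struct pr_usrreqs *pru)
{
	pru->pru_accept = pru_accept_notsupp;
	pru->pru_connect2 = pru_connect2_notsupp;
	pru->pru_rcvoob = pru_rcvoob_notsupp;
	pru->pru_sense = pru_sense_null;	/* benign default for fstat() */
	pru->pru_sopoll = pru_sopoll_notsupp;
}
#endif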
/*
 * The following are macros on BSD and functions on Darwin
 */

/*
 * Do we need to notify the other side when I/O is possible?
 */

int
sb_notify(struct sockbuf *sb)
{
	return ((sb->sb_flags & (SB_WAIT|SB_SEL|SB_ASYNC|SB_UPCALL|SB_KNOTE)) != 0);
}
/*
 * How much space is there in a socket buffer (so->so_snd or so->so_rcv)?
 * This is problematical if the fields are unsigned, as the space might
 * still be negative (cc > hiwat or mbcnt > mbmax).  Should detect
 * overflow and return 0.  Should use "lmin" but it doesn't exist now.
 */
long
sbspace(struct sockbuf *sb)
{
	return ((long) imin((int)(sb->sb_hiwat - sb->sb_cc),
	    (int)(sb->sb_mbmax - sb->sb_mbcnt)));
}
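/*
 * Worked example (illustrative numbers): with sb_hiwat = 32768,
 * sb_cc = 20000, sb_mbmax = 65536 and sb_mbcnt = 60000, the byte limit
 * leaves 12768 but the mbuf accounting leaves only 5536, so sbspace()
 * returns 5536; whichever limit is tighter governs how much more data
 * may be appended.
 */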
/* do we have to send all at once on a socket? */
int
sosendallatonce(struct socket *so)
{
	return (so->so_proto->pr_flags & PR_ATOMIC);
}

/* can we read something from so? */
int
soreadable(struct socket *so)
{
	return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
	    (so->so_state & SS_CANTRCVMORE) ||
	    so->so_comp.tqh_first || so->so_error);
}

/* can we write something to so? */
int
sowriteable(struct socket *so)
{
	return ((sbspace(&(so)->so_snd) >= (so)->so_snd.sb_lowat &&
	    ((so->so_state & SS_ISCONNECTED) ||
	     (so->so_proto->pr_flags & PR_CONNREQUIRED) == 0)) ||
	    (so->so_state & SS_CANTSENDMORE) ||
	    so->so_error);
}
/* adjust counters in sb reflecting allocation of m */
void
sballoc(struct sockbuf *sb, struct mbuf *m)
{
	sb->sb_cc += m->m_len;
	sb->sb_mbcnt += MSIZE;
	if (m->m_flags & M_EXT)
		sb->sb_mbcnt += m->m_ext.ext_size;
}

/* adjust counters in sb reflecting freeing of m */
void
sbfree(struct sockbuf *sb, struct mbuf *m)
{
	sb->sb_cc -= m->m_len;
	sb->sb_mbcnt -= MSIZE;
	if (m->m_flags & M_EXT)
		sb->sb_mbcnt -= m->m_ext.ext_size;
}
/*
 * Set lock on sockbuf sb; sleep if lock is already held.
 * Unless SB_NOINTR is set on sockbuf, sleep is interruptible.
 * Returns error without lock if sleep is interrupted.
 */
int
sblock(struct sockbuf *sb, int wf)
{
	return (sb->sb_flags & SB_LOCK ?
	    ((wf == M_WAIT) ? sb_lock(sb) : EWOULDBLOCK) :
	    (sb->sb_flags |= SB_LOCK), 0);
}
/* release lock on sockbuf sb */
void
sbunlock(struct sockbuf *sb, int keeplocked)
{
	struct socket *so = sb->sb_so;
	int lr_saved;
	lck_mtx_t *mutex_held;

	lr_saved = (unsigned int) __builtin_return_address(0);

	sb->sb_flags &= ~SB_LOCK;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;

	if (keeplocked == 0)
		lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (sb->sb_flags & SB_WANT) {
		sb->sb_flags &= ~SB_WANT;
		if (so->so_usecount < 0)
			panic("sbunlock: b4 wakeup so=%x ref=%d lr=%x sb_flags=%x\n",
			    sb->sb_so, so->so_usecount, lr_saved, sb->sb_flags);

		wakeup((caddr_t)&(sb)->sb_flags);
	}
	if (keeplocked == 0) {	/* unlock on exit */
		so->so_usecount--;
		if (so->so_usecount < 0)
			panic("sbunlock: unlock on exit so=%x ref=%d lr=%x sb_flags=%x\n",
			    so, so->so_usecount, lr_saved, sb->sb_flags);
		so->unlock_lr[so->next_unlock_lr] = (void *)lr_saved;
		so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
		lck_mtx_unlock(mutex_held);
	}
}
void
sorwakeup(struct socket *so)
{
	if (sb_notify(&so->so_rcv))
		sowakeup(so, &so->so_rcv);
}

void
sowwakeup(struct socket *so)
{
	if (sb_notify(&so->so_snd))
		sowakeup(so, &so->so_snd);
}
/*
 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
 */
struct sockaddr *
dup_sockaddr(sa, canwait)
	struct sockaddr *sa;
	int canwait;
{
	struct sockaddr *sa2;

	MALLOC(sa2, struct sockaddr *, sa->sa_len, M_SONAME,
	    canwait ? M_WAITOK : M_NOWAIT);
	if (sa2)
		bcopy(sa, sa2, sa->sa_len);
	return sa2;
}
/*
 * Create an external-format (``xsocket'') structure using the information
 * in the kernel-format socket structure pointed to by so.  This is done
 * to reduce the spew of irrelevant information over this interface,
 * to isolate user code from changes in the kernel structure, and
 * potentially to provide information-hiding if we decide that
 * some of this information should be hidden from users.
 */
void
sotoxsocket(struct socket *so, struct xsocket *xso)
{
	xso->xso_len = sizeof *xso;
	xso->xso_so = so;
	xso->so_type = so->so_type;
	xso->so_options = so->so_options;
	xso->so_linger = so->so_linger;
	xso->so_state = so->so_state;
	xso->so_pcb = so->so_pcb;
	if (so->so_proto) {
		xso->xso_protocol = so->so_proto->pr_protocol;
		xso->xso_family = so->so_proto->pr_domain->dom_family;
	} else
		xso->xso_protocol = xso->xso_family = 0;
	xso->so_qlen = so->so_qlen;
	xso->so_incqlen = so->so_incqlen;
	xso->so_qlimit = so->so_qlimit;
	xso->so_timeo = so->so_timeo;
	xso->so_error = so->so_error;
	xso->so_pgid = so->so_pgid;
	xso->so_oobmark = so->so_oobmark;
	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
	xso->so_uid = so->so_uid;
}
/*
 * This does the same for sockbufs.  Note that the xsockbuf structure,
 * since it is always embedded in a socket, does not include a self
 * pointer nor a length.  We make this entry point public in case
 * some other mechanism needs it.
 */
void
sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
{
	xsb->sb_cc = sb->sb_cc;
	xsb->sb_hiwat = sb->sb_hiwat;
	xsb->sb_mbcnt = sb->sb_mbcnt;
	xsb->sb_mbmax = sb->sb_mbmax;
	xsb->sb_lowat = sb->sb_lowat;
	xsb->sb_flags = sb->sb_flags;
	xsb->sb_timeo = (u_long)(sb->sb_timeo.tv_sec * hz) + sb->sb_timeo.tv_usec / tick;
	if (xsb->sb_timeo == 0 && sb->sb_timeo.tv_usec != 0)
		xsb->sb_timeo = 1;
}
/*
 * Here is the definition of some of the basic objects in the kern.ipc
 * branch of the MIB.
 */
SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");

/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */
static int dummy;
SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, "");

SYSCTL_INT(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLFLAG_RW,
    &sb_max, 0, "Maximum socket buffer size");
SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RD,
    &maxsockets, 0, "Maximum number of sockets available");
SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
    &sb_efficiency, 0, "");
SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD, &nmbclusters, 0, "");