/*
 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1998, 1999 Apple Computer, Inc. All Rights Reserved */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
 * $FreeBSD: src/sys/kern/uipc_socket2.c,v 1.55.2.9 2001/07/26 18:53:02 peter Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>		/* needed by the mbuf/sockbuf routines below */
#include <sys/protosw.h>
#include <sys/stat.h>		/* struct stat, used by pru_sense_null() */
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <kern/locks.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <sys/kdebug.h>
#define DBG_FNC_SBDROP		NETDBG_CODE(DBG_NETSOCK, 4)
#define DBG_FNC_SBAPPEND	NETDBG_CODE(DBG_NETSOCK, 5)
/*
 * Primitive routines for operating on sockets and socket buffers.
 */

u_long	sb_max = SB_MAX;		/* XXX should be static */

static u_long sb_efficiency = 8;	/* parameter for sbreserve() */
/*
 * Procedures to manipulate state flags of socket
 * and do appropriate wakeups.  Normal sequence from the
 * active (originating) side is that soisconnecting() is
 * called during processing of connect() call,
 * resulting in an eventual call to soisconnected() if/when the
 * connection is established.  When the connection is torn down
 * soisdisconnecting() is called during processing of disconnect() call,
 * and soisdisconnected() is called when the connection to the peer
 * is totally severed.  The semantics of these routines are such that
 * connectionless protocols can call soisconnected() and soisdisconnected()
 * only, bypassing the in-progress calls when setting up a ``connection''
 * takes no time.
 *
 * From the passive side, a socket is created with
 * two queues of sockets: so_incomp for connections in progress
 * and so_comp for connections already made and awaiting user acceptance.
 * As a protocol is preparing incoming connections, it creates a socket
 * structure queued on so_incomp by calling sonewconn().  When the connection
 * is established, soisconnected() is called, and transfers the
 * socket structure to so_comp, making it available to accept().
 *
 * If a socket is closed with sockets on either
 * so_incomp or so_comp, these sockets are dropped.
 *
 * If higher level protocols are implemented in
 * the kernel, the wakeups done here will sometimes
 * cause software-interrupt process scheduling.
 */
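
/*
 * Editor's sketch (not part of the original file): how a connection-
 * oriented protocol might drive the transitions described above.  The
 * myproto_* names and myproto_start_handshake() are hypothetical.
 */
#if 0
static int
myproto_connect(struct socket *so, struct sockaddr *nam)
{
	soisconnecting(so);		/* connect() now in progress */
	return (myproto_start_handshake(so, nam));
}

static void
myproto_handshake_done(struct socket *so)
{
	/* wakes connect(); on a passive socket, moves it to so_comp */
	soisconnected(so);
}
#endif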
void
soisconnecting(so)
	register struct socket *so;
{
	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;

	sflt_notify(so, sock_evt_connecting, NULL);
}
void
soisconnected(so)
	struct socket *so;
{
	struct socket *head = so->so_head;

	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
	so->so_state |= SS_ISCONNECTED;

	sflt_notify(so, sock_evt_connected, NULL);

	if (head && (so->so_state & SS_INCOMP)) {
		so->so_state &= ~SS_INCOMP;
		so->so_state |= SS_COMP;
		if (head->so_proto->pr_getlock != NULL) {
			socket_unlock(so, 0);
			socket_lock(head, 1);
		}
		postevent(head, 0, EV_RCONN);
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		head->so_incqlen--;
		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
		sorwakeup(head);
		wakeup_one((caddr_t)&head->so_timeo);
		if (head->so_proto->pr_getlock != NULL) {
			socket_unlock(head, 1);
			socket_lock(so, 0);
		}
	} else {
		postevent(so, 0, EV_WCONN);
		wakeup((caddr_t)&so->so_timeo);
		sorwakeup(so);
		sowwakeup(so);
	}
}
void
soisdisconnecting(so)
	register struct socket *so;
{
	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
	sflt_notify(so, sock_evt_disconnecting, NULL);
	wakeup((caddr_t)&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}
void
soisdisconnected(so)
	register struct socket *so;
{
	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
	sflt_notify(so, sock_evt_disconnected, NULL);
	wakeup((caddr_t)&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}
/*
 * Return a random connection that hasn't been serviced yet and
 * is eligible for discard.  There is a one in qlen chance that
 * we will return a null, saying that there are no droppable
 * requests.  In this case, the protocol specific code should drop
 * the new request.  This ensures fairness.
 *
 * This may be used in conjunction with protocol specific queue
 * congestion routines.
 */
struct socket *
sodropablereq(head)
	register struct socket *head;
{
	struct socket *so, *sonext = NULL;
	unsigned int i, j, qlen;
	static int rnd;
	static struct timeval old_runtime;
	static unsigned int cur_cnt, old_cnt;
	struct timeval tv;

	microtime(&tv);
	if ((i = (tv.tv_sec - old_runtime.tv_sec)) != 0) {
		old_runtime = tv;
		old_cnt = cur_cnt / i;
		cur_cnt = 0;
	}

	so = TAILQ_FIRST(&head->so_incomp);
	if (!so)
		return (NULL);

	qlen = head->so_incqlen;
	if (++cur_cnt > qlen || old_cnt > qlen) {
		rnd = (314159 * rnd + 66329) & 0xffff;
		j = ((qlen + 1) * rnd) >> 16;

		while (j-- && so) {
//			if (in_pcb_checkstate(so->so_pcb, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
				socket_lock(so, 1);
				sonext = TAILQ_NEXT(so, so_list);
//				in_pcb_check_state(so->so_pcb, WNT_RELEASE, 0);
				socket_unlock(so, 1);
				so = sonext;
//			}
		}
	}

//	if (in_pcb_checkstate(so->so_pcb, WNT_ACQUIRE, 0) == WNT_STOPUSING)
//		return (NULL);
//	else
		return (so);
}
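
/*
 * Editor's note on the arithmetic above: rnd is stepped as a 16-bit
 * linear congruential generator, so ((qlen + 1) * rnd) >> 16 maps it
 * roughly uniformly onto 0..qlen.  Walking j entries from the head of a
 * qlen-entry list yields NULL when j == qlen, which is the "drop the
 * new request instead" outcome described in the comment above.  For
 * example, qlen = 3 and rnd = 0x8000 give j = (4 * 0x8000) >> 16 = 2.
 */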
/*
 * When an attempt at a new connection is noted on a socket
 * which accepts connections, sonewconn is called.  If the
 * connection is possible (subject to space constraints, etc.)
 * then we allocate a new structure, properly linked into the
 * data structure of the original socket, and return this.
 * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED.
 */
static struct socket *
sonewconn_internal(head, connstatus)
	register struct socket *head;
	int connstatus;
{
	int error = 0;
	register struct socket *so;
	lck_mtx_t *mutex_held;

	if (head->so_proto->pr_getlock != NULL)
		mutex_held = (*head->so_proto->pr_getlock)(head, 0);
	else
		mutex_held = head->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (head->so_qlen > 3 * head->so_qlimit / 2)
		return ((struct socket *)0);
	so = soalloc(1, head->so_proto->pr_domain->dom_family, head->so_type);
	if (so == NULL)
		return ((struct socket *)0);
	/* check if head was closed during the soalloc */
	if (head->so_proto == NULL) {
		sodealloc(so);
		return ((struct socket *)0);
	}

	so->so_head = head;
	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_proto = head->so_proto;
	so->so_timeo = head->so_timeo;
	so->so_pgid  = head->so_pgid;
	so->so_uid = head->so_uid;

	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
		sodealloc(so);
		return ((struct socket *)0);
	}

	/*
	 * Must be done with head unlocked to avoid deadlock
	 * for protocols with per socket mutexes.
	 */
	if (head->so_proto->pr_unlock)
		socket_unlock(head, 0);
	if (((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL) != 0) || error) {
		sodealloc(so);
		if (head->so_proto->pr_unlock)
			socket_lock(head, 0);
		return ((struct socket *)0);
	}
	if (head->so_proto->pr_unlock)
		socket_lock(head, 0);
	so->so_proto->pr_domain->dom_refs++;

	if (connstatus) {
		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
		so->so_state |= SS_COMP;
	} else {
		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
		so->so_state |= SS_INCOMP;
		head->so_incqlen++;
	}
	head->so_qlen++;

	so->so_rcv.sb_flags |= SB_RECV;	/* XXX */
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	TAILQ_INIT(&so->so_evlist);

	/* Attach socket filters for this protocol */
	sflt_initsock(so);

	if (connstatus) {
		so->so_state |= connstatus;
		sorwakeup(head);
		wakeup((caddr_t)&head->so_timeo);
	}
	return (so);
}
struct socket *
sonewconn(
	struct socket *head,
	int connstatus,
	const struct sockaddr *from)
{
	int error = 0;
	struct socket_filter_entry *filter;
	int filtered = 0;

	for (filter = head->so_filt; filter && (error == 0);
	     filter = filter->sfe_next_onsocket) {
		if (filter->sfe_filter->sf_filter.sf_connect_in) {
			if (filtered == 0) {
				filtered = 1;
				sflt_use(head);
				socket_unlock(head, 0);
			}
			error = filter->sfe_filter->sf_filter.sf_connect_in(
			    filter->sfe_cookie, head, from);
		}
	}
	if (filtered != 0) {
		socket_lock(head, 0);
		sflt_unuse(head);
	}

	if (error)
		return (NULL);

	return sonewconn_internal(head, connstatus);
}
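
/*
 * Editor's sketch (not part of the original file): a listening protocol
 * typically calls sonewconn() from its input path when a connection
 * request arrives.  struct myproto_req and myproto_req_input are
 * hypothetical.
 */
#if 0
struct myproto_req { struct sockaddr_in from; };

static void
myproto_req_input(struct socket *head, struct myproto_req *req)
{
	struct socket *so;

	/* queues the new socket on head->so_incomp */
	so = sonewconn(head, 0, (const struct sockaddr *)&req->from);
	if (so == NULL)
		return;		/* queue limit reached or a filter rejected it */
	/*
	 * ... run the protocol handshake; when it completes,
	 * soisconnected(so) moves the socket to head->so_comp ...
	 */
}
#endif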
/*
 * Socantsendmore indicates that no more data will be sent on the
 * socket; it would normally be applied to a socket when the user
 * informs the system that no more data is to be sent, by the protocol
 * code (in case PRU_SHUTDOWN).  Socantrcvmore indicates that no more data
 * will be received, and will normally be applied to the socket by a
 * protocol when it detects that the peer will send no more data.
 * Data queued for reading in the socket may yet be read.
 */
void
socantsendmore(so)
	struct socket *so;
{
	so->so_state |= SS_CANTSENDMORE;
	sflt_notify(so, sock_evt_cantsendmore, NULL);
	sowwakeup(so);
}

void
socantrcvmore(so)
	struct socket *so;
{
	so->so_state |= SS_CANTRCVMORE;
	sflt_notify(so, sock_evt_cantrecvmore, NULL);
	sorwakeup(so);
}
/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
int
sbwait(sb)
	struct sockbuf *sb;
{
	int error = 0, lr, lr_saved;
	struct socket *so = sb->sb_so;
	lck_mtx_t *mutex_held;
	struct timespec ts;

#ifdef __ppc__
	__asm__ volatile("mflr %0" : "=r" (lr));
	lr_saved = lr;
#endif

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;

	sb->sb_flags |= SB_WAIT;

	if (so->so_usecount < 1)
		panic("sbwait: so=%x refcount=%d\n", so, so->so_usecount);
	ts.tv_sec = sb->sb_timeo.tv_sec;
	ts.tv_nsec = sb->sb_timeo.tv_usec * 1000;
	error = msleep((caddr_t)&sb->sb_cc, mutex_held,
	    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait",
	    &ts);

	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (so->so_usecount < 1)
		panic("sbwait: so=%x refcount=%d\n", so, so->so_usecount);

	if ((so->so_state & SS_DRAINING)) {
		error = EBADF;
	}

	return (error);
}
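
/*
 * Editor's sketch (not part of the original file): the canonical
 * sbwait() pattern used by receive paths such as soreceive(): sleep
 * until data arrives, rechecking the condition after each wakeup.
 */
#if 0
	while (so->so_rcv.sb_cc < so->so_rcv.sb_lowat &&
	    (so->so_state & SS_CANTRCVMORE) == 0 && so->so_error == 0) {
		error = sbwait(&so->so_rcv);	/* sleeps on sb_cc */
		if (error)
			break;			/* timeout or interrupt */
	}
#endif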
/*
 * Lock a sockbuf already known to be locked;
 * return any error returned from sleep (EINTR).
 */
int
sb_lock(sb)
	register struct sockbuf *sb;
{
	struct socket *so = sb->sb_so;
	lck_mtx_t *mutex_held;
	int error = 0, lr, lr_saved;

#ifdef __ppc__
	__asm__ volatile("mflr %0" : "=r" (lr));
	lr_saved = lr;
#endif

	if (so == NULL)
		panic("sb_lock: null so back pointer sb=%x\n", sb);

	while (sb->sb_flags & SB_LOCK) {
		sb->sb_flags |= SB_WANT;
		if (so->so_proto->pr_getlock != NULL)
			mutex_held = (*so->so_proto->pr_getlock)(so, 0);
		else
			mutex_held = so->so_proto->pr_domain->dom_mtx;
		if (so->so_usecount < 1)
			panic("sb_lock: so=%x refcount=%d\n", so, so->so_usecount);
		error = msleep((caddr_t)&sb->sb_flags, mutex_held,
		    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sblock", 0);
		if (so->so_usecount < 1)
			panic("sb_lock: 2 so=%x refcount=%d\n", so, so->so_usecount);
		if (error)
			return (error);
	}
	sb->sb_flags |= SB_LOCK;
	return (0);
}
/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket has the SS_ASYNC flag set.
 */
void
sowakeup(so, sb)
	register struct socket *so;
	register struct sockbuf *sb;
{
	struct proc *p = current_proc();

	sb->sb_flags &= ~SB_SEL;
	selwakeup(&sb->sb_sel);
	if (sb->sb_flags & SB_WAIT) {
		sb->sb_flags &= ~SB_WAIT;
		wakeup((caddr_t)&sb->sb_cc);
	}
	if (so->so_state & SS_ASYNC) {
		if (so->so_pgid < 0)
			gsignal(-so->so_pgid, SIGIO);
		else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
			psignal(p, SIGIO);
	}
	if (sb->sb_flags & SB_KNOTE) {
		KNOTE(&sb->sb_sel.si_note, SO_FILT_HINT_LOCKED);
	}
	if (sb->sb_flags & SB_UPCALL) {
		socket_unlock(so, 0);
		(*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT);
		socket_lock(so, 0);
	}
}
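
/*
 * Editor's sketch (not part of the original file): an in-kernel consumer
 * can have sowakeup() call it back by registering an upcall on the
 * receive buffer.  my_rcv_upcall, my_enable_upcall and my_ctx are
 * hypothetical.
 */
#if 0
static void my_rcv_upcall(struct socket *so, caddr_t arg, int waitf);

static void
my_enable_upcall(struct socket *so, caddr_t my_ctx)
{
	socket_lock(so, 1);
	so->so_upcall = my_rcv_upcall;
	so->so_upcallarg = my_ctx;
	so->so_rcv.sb_flags |= SB_UPCALL;
	socket_unlock(so, 1);
}
#endif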
/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing select() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field.  The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbuf's must be of type MT_RIGHTS).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space to the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */
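
/*
 * Editor's sketch (not part of the original file): the reserve/release
 * lifecycle described above, as a protocol attach routine might use it.
 * myproto_attach and the 8 KB sizes are hypothetical.
 */
#if 0
static int
myproto_attach(struct socket *so, int proto, struct proc *p)
{
	/* returns ENOBUFS if either reservation fails */
	return (soreserve(so, 8192, 8192));
}
#endif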
int
soreserve(so, sndcc, rcvcc)
	register struct socket *so;
	u_long sndcc, rcvcc;
{
	if (sbreserve(&so->so_snd, sndcc) == 0)
		goto bad;
	if (sbreserve(&so->so_rcv, rcvcc) == 0)
		goto bad2;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = MCLBYTES;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	return (0);
bad2:
	selthreadclear(&so->so_snd.sb_sel);
	sbrelease(&so->so_snd);
bad:
	return (ENOBUFS);
}
/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 */
int
sbreserve(sb, cc)
	struct sockbuf *sb;
	u_long cc;
{
	if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES))
		return (0);
	sb->sb_hiwat = cc;
	sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (1);
}
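
/*
 * Editor's worked example of the limits above, assuming the traditional
 * MSIZE = 256 and MCLBYTES = 2048: the largest acceptable cc is
 * sb_max * 2048 / 2304, i.e. about 8/9 of sb_max, and with
 * sb_efficiency = 8 the mbuf-storage allowance is
 * sb_mbmax = min(8 * cc, sb_max).
 */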
/*
 * Free mbufs held by a socket, and reserved mbuf space.
 */
/* WARNING needs to do selthreadclear() before calling this */
void
sbrelease(sb)
	struct sockbuf *sb;
{
	sbflush(sb);
	sb->sb_hiwat = 0;
	sb->sb_mbmax = 0;
}
/*
 * Routines to add and remove
 * data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, comparing the function sbspace() with the amount
 * of data to be added.  sbappendrecord() differs from sbappend() in
 * that data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendrights()
 * should be used.  In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copy for output to a peer,
 * and then removing the data from the socket buffer with sbdrop()
 * or sbdroprecord() when the data is acknowledged by the peer.
 */
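
/*
 * Editor's sketch (not part of the original file): the usual pattern in
 * a protocol input path.  sbappend() returns 0 when a socket filter
 * swallowed the data, in which case the wakeup must be skipped.
 */
#if 0
	if (sbappend(&so->so_rcv, m))
		sorwakeup(so);
#endif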
/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated
 * with the mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
int
sbappend(sb, m)
	struct sockbuf *sb;
	struct mbuf *m;
{
	register struct mbuf *n, *sb_first;
	int result = 0;
	int error = 0;
	int filtered = 0;

	KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_START), sb, m->m_len, 0, 0, 0);

	if (m == 0)
		return 0;

again:
	sb_first = n = sb->sb_mb;
	if (n) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		do {
			if (n->m_flags & M_EOR) {
				result = sbappendrecord(sb, m); /* XXXXXX!!!! */
				KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_END), sb, sb->sb_cc, 0, 0, 0);
				return result;
			}
		} while (n->m_next && (n = n->m_next));
	}

	if (!filtered && (sb->sb_flags & SB_RECV) != 0) {
		error = sflt_data_in(sb->sb_so, NULL, &m, NULL, 0, &filtered);
		if (error) {
			/* no data was appended, caller should not call sowakeup */
			return 0;
		}

		/*
		 * If we ran any filters, the socket lock was dropped.  n and
		 * sb_first cached data from the socket buffer.  This cache is
		 * not valid since we dropped the lock.  We must start over.
		 * Since filtered is set we won't run through the filters a
		 * second time.  We just set n and sb_first again.
		 */
		if (filtered)
			goto again;
	}

	result = sbcompress(sb, m, n);

	KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_END), sb, sb->sb_cc, 0, 0, 0);

	return result;
}
/*
 * Check that sb's byte and mbuf counts agree with its mbuf chains.
 */
void
sbcheck(sb)
	register struct sockbuf *sb;
{
	register struct mbuf *m;
	register struct mbuf *n = 0;
	register u_long len = 0, mbcnt = 0;
	lck_mtx_t *mutex_held;

	if (sb->sb_so->so_proto->pr_getlock != NULL)
		mutex_held = (*sb->sb_so->so_proto->pr_getlock)(sb->sb_so, 0);
	else
		mutex_held = sb->sb_so->so_proto->pr_domain->dom_mtx;

	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	for (m = sb->sb_mb; m; m = n) {
		n = m->m_nextpkt;
		for (; m; m = m->m_next) {
			len += m->m_len;
			mbcnt += MSIZE;
			if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
				mbcnt += m->m_ext.ext_size;
		}
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		panic("cc %ld != %ld || mbcnt %ld != %ld\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
	}
}
/*
 * As above, except the mbuf chain
 * begins a new record.
 */
int
sbappendrecord(sb, m0)
	register struct sockbuf *sb;
	register struct mbuf *m0;
{
	register struct mbuf *m;

	if (m0 == 0)
		return 0;

	if ((sb->sb_flags & SB_RECV) != 0) {
		int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL,
		    sock_data_filt_flag_record, NULL);
		if (error != 0) {
			if (error != EJUSTRETURN)
				m_freem(m0);
			return 0;
		}
	}

	m = sb->sb_mb;
	if (m)
		while (m->m_nextpkt)
			m = m->m_nextpkt;
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	if (m)
		m->m_nextpkt = m0;
	else
		sb->sb_mb = m0;
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	return sbcompress(sb, m, m0);
}
/*
 * As above except that OOB data
 * is inserted at the beginning of the sockbuf,
 * but after any other OOB data.
 */
int
sbinsertoob(sb, m0)
	register struct sockbuf *sb;
	register struct mbuf *m0;
{
	register struct mbuf *m;
	register struct mbuf **mp;

	if (m0 == 0)
		return 0;

	if ((sb->sb_flags & SB_RECV) != 0) {
		int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL,
		    sock_data_filt_flag_oob, NULL);

		if (error) {
			if (error != EJUSTRETURN) {
				m_freem(m0);
			}
			return 0;
		}
	}

	for (mp = &sb->sb_mb; *mp; mp = &((*mp)->m_nextpkt)) {
		m = *mp;
again:
		switch (m->m_type) {

		case MT_OOBDATA:
			continue;	/* WANT next train */

		case MT_CONTROL:
			m = m->m_next;
			if (m)
				goto again;	/* inspect THIS train further */
		}
		break;
	}
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	m0->m_nextpkt = *mp;
	*mp = m0;
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	return sbcompress(sb, m, m0);
}
/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if no space in sockbuf or insufficient mbufs.
 */
static int
sbappendaddr_internal(sb, asa, m0, control)
	register struct sockbuf *sb;
	struct sockaddr *asa;
	struct mbuf *m0, *control;
{
	register struct mbuf *m, *n;
	int space = asa->sa_len;

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddr");

	if (m0)
		space += m0->m_pkthdr.len;
	for (n = control; n; n = n->m_next) {
		space += n->m_len;
		if (n->m_next == 0)	/* keep pointer to last control buf */
			break;
	}
	if (space > sbspace(sb))
		return (0);
	if (asa->sa_len > MLEN)
		return (0);
	MGET(m, M_DONTWAIT, MT_SONAME);
	if (m == 0)
		return (0);
	m->m_len = asa->sa_len;
	bcopy((caddr_t)asa, mtod(m, caddr_t), asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;
	for (n = m; n; n = n->m_next)
		sballoc(sb, n);
	n = sb->sb_mb;
	if (n) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		n->m_nextpkt = m;
	} else
		sb->sb_mb = m;
	postevent(0, sb, EV_RWBYTES);
	return (1);
}
int
sbappendaddr(
	struct sockbuf *sb,
	struct sockaddr *asa,
	struct mbuf *m0,
	struct mbuf *control,
	int *error_out)
{
	int result = 0;

	if (error_out) *error_out = 0;

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddrorfree");

	/* Call socket data in filters */
	if ((sb->sb_flags & SB_RECV) != 0) {
		int error;
		error = sflt_data_in(sb->sb_so, asa, &m0, &control, 0, NULL);
		if (error) {
			if (error != EJUSTRETURN) {
				if (m0) m_freem(m0);
				if (control) m_freem(control);
				if (error_out) *error_out = error;
			}
			return 0;
		}
	}

	result = sbappendaddr_internal(sb, asa, m0, control);
	if (result == 0) {
		if (m0) m_freem(m0);
		if (control) m_freem(control);
		if (error_out) *error_out = ENOBUFS;
	}

	return result;
}
static int
sbappendcontrol_internal(sb, m0, control)
	struct sockbuf *sb;
	struct mbuf *control, *m0;
{
	register struct mbuf *m, *n;
	int space = 0;

	if (control == 0)
		panic("sbappendcontrol");

	for (m = control; ; m = m->m_next) {
		space += m->m_len;
		if (m->m_next == 0)
			break;
	}
	n = m;			/* save pointer to last control buffer */
	for (m = m0; m; m = m->m_next)
		space += m->m_len;
	if (space > sbspace(sb))
		return (0);
	n->m_next = m0;		/* concatenate data to control */
	for (m = control; m; m = m->m_next)
		sballoc(sb, m);
	n = sb->sb_mb;
	if (n) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		n->m_nextpkt = control;
	} else
		sb->sb_mb = control;
	postevent(0, sb, EV_RWBYTES);
	return (1);
}
int
sbappendcontrol(
	struct sockbuf *sb,
	struct mbuf *m0,
	struct mbuf *control,
	int *error_out)
{
	int result = 0;

	if (error_out) *error_out = 0;

	if (sb->sb_flags & SB_RECV) {
		int error;
		error = sflt_data_in(sb->sb_so, NULL, &m0, &control, 0, NULL);
		if (error) {
			if (error != EJUSTRETURN) {
				if (m0) m_freem(m0);
				if (control) m_freem(control);
				if (error_out) *error_out = error;
			}
			return 0;
		}
	}

	result = sbappendcontrol_internal(sb, m0, control);
	if (result == 0) {
		if (m0) m_freem(m0);
		if (control) m_freem(control);
		if (error_out) *error_out = ENOBUFS;
	}

	return result;
}
/*
 * Compress mbuf chain m into the socket
 * buffer sb following mbuf n.  If n
 * is null, the buffer is presumed empty.
 */
int
sbcompress(sb, m, n)
	register struct sockbuf *sb;
	register struct mbuf *m, *n;
{
	register int eor = 0;
	register struct mbuf *o;

	while (m) {
		eor |= m->m_flags & M_EOR;
		if (m->m_len == 0 &&
		    (eor == 0 ||
		     (((o = m->m_next) || (o = n)) &&
		      o->m_type == m->m_type))) {
			m = m_free(m);
			continue;
		}
		if (n && (n->m_flags & M_EOR) == 0 &&
		    m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
		    m->m_len <= M_TRAILINGSPACE(n) &&
		    n->m_type == m->m_type) {
			bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
			    (unsigned)m->m_len);
			n->m_len += m->m_len;
			sb->sb_cc += m->m_len;
			m = m_free(m);
			continue;
		}
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sballoc(sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = 0;
	}
	if (eor) {
		if (n)
			n->m_flags |= eor;
		else
			printf("semi-panic: sbcompress\n");
	}
	postevent(0, sb, EV_RWBYTES);
	return 1;
}
/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
void
sbflush(sb)
	register struct sockbuf *sb;
{
	if (sb->sb_so == NULL)
		panic("sbflush sb->sb_so already null sb=%x\n", sb);
	(void)sblock(sb, M_WAIT);
	while (sb->sb_mbcnt) {
		/*
		 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
		 * we would loop forever. Panic instead.
		 */
		if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len))
			break;
		sbdrop(sb, (int)sb->sb_cc);
	}
	if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt || sb->sb_so == NULL)
		panic("sbflush: cc %ld || mb %p || mbcnt %ld sb_so=%x",
		    sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt, sb->sb_so);

	postevent(0, sb, EV_RWBYTES);
	sbunlock(sb, 1);	/* keep socket locked */
}
/*
 * Drop data from (the front of) a sockbuf.  Use m_freem_list to free
 * the mbuf structures under a single lock.  This is done by pruning
 * the top of the tree from the body: we keep track of how far we get
 * in the tree, zero the two pertinent pointers (m_nextpkt and m_next),
 * point the socket buffer at the new top of the tree, and release the
 * pruned area via m_freem_list.
 */
void
sbdrop(sb, len)
	register struct sockbuf *sb;
	register int len;
{
	register struct mbuf *m, *free_list, *ml;
	struct mbuf *next, *last;

	KERNEL_DEBUG((DBG_FNC_SBDROP | DBG_FUNC_START), sb, len, 0, 0, 0);

	next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
	free_list = last = m;
	ml = (struct mbuf *)0;

	while (len > 0) {
		if (m == 0) {
			if (next == 0) {
				/*
				 * Temporarily replacing this panic with printf
				 * because it occurs occasionally when closing a
				 * socket, and there is no harm in ignoring it.
				 * This problem will be investigated further.
				 */
				/* panic("sbdrop"); */
				printf("sbdrop - count not zero\n");
				len = 0;
				/*
				 * zero the counts. if we have no mbufs,
				 * we have no data (PR-2986815)
				 */
				sb->sb_cc = 0;
				sb->sb_mbcnt = 0;
				break;
			}
			m = last = next;
			next = m->m_nextpkt;
			continue;
		}
		if (m->m_len > len) {
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			break;
		}
		len -= m->m_len;
		sbfree(sb, m);

		ml = m;
		m = m->m_next;
	}
	while (m && m->m_len == 0) {
		sbfree(sb, m);

		ml = m;
		m = m->m_next;
	}
	if (ml) {
		ml->m_next = (struct mbuf *)0;
		last->m_nextpkt = (struct mbuf *)0;
		m_freem_list(free_list);
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;

	postevent(0, sb, EV_RWBYTES);

	KERNEL_DEBUG((DBG_FNC_SBDROP | DBG_FUNC_END), sb, 0, 0, 0, 0);
}
/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
void
sbdroprecord(sb)
	register struct sockbuf *sb;
{
	register struct mbuf *m, *mn;

	m = sb->sb_mb;
	if (m) {
		sb->sb_mb = m->m_nextpkt;
		do {
			sbfree(sb, m);
			MFREE(m, mn);
			m = mn;
		} while (m);
	}
	postevent(0, sb, EV_RWBYTES);
}
/*
 * Create a "control" mbuf containing the specified data
 * with the specified type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol(p, size, type, level)
	caddr_t p;
	register int size;
	int type, level;
{
	register struct cmsghdr *cp;
	struct mbuf *m;

	if (CMSG_SPACE((u_int)size) > MLEN)
		return ((struct mbuf *) NULL);
	if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
		return ((struct mbuf *) NULL);
	cp = mtod(m, struct cmsghdr *);
	/* XXX check size? */
	(void)memcpy(CMSG_DATA(cp), p, size);
	m->m_len = CMSG_SPACE(size);
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	return (m);
}
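
/*
 * Editor's sketch (not part of the original file): typical use of
 * sbcreatecontrol() in BSD stacks, building an SCM_TIMESTAMP record
 * for a receive queue.
 */
#if 0
	struct timeval tv;
	struct mbuf *control;

	microtime(&tv);
	/* NULL on failure (no mbuf, or data would not fit in MLEN) */
	control = sbcreatecontrol((caddr_t)&tv, sizeof (tv),
	    SCM_TIMESTAMP, SOL_SOCKET);
#endif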
/*
 * Some routines that return EOPNOTSUPP for entry points that are not
 * supported by a protocol.  Fill in as needed.
 */
int
pru_abort_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
{
	return EOPNOTSUPP;
}

int
pru_attach_notsupp(struct socket *so, int proto, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_connect2_notsupp(struct socket *so1, struct socket *so2)
{
	return EOPNOTSUPP;
}

int
pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
    struct ifnet *ifp, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_detach_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_disconnect_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_listen_notsupp(struct socket *so, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
{
	return EOPNOTSUPP;
}

int
pru_rcvd_notsupp(struct socket *so, int flags)
{
	return EOPNOTSUPP;
}

int
pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
{
	return EOPNOTSUPP;
}

int
pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
    struct sockaddr *addr, struct mbuf *control,
    struct proc *p)
{
	return EOPNOTSUPP;
}
/*
 * This isn't really a ``null'' operation, but it's the default one
 * and doesn't do anything destructive.
 */
int
pru_sense_null(struct socket *so, struct stat *sb)
{
	sb->st_blksize = so->so_snd.sb_hiwat;
	return 0;
}
int pru_sosend_notsupp(struct socket *so, struct sockaddr *addr,
    struct uio *uio, struct mbuf *top,
    struct mbuf *control, int flags)
{
	return EOPNOTSUPP;
}

int pru_soreceive_notsupp(struct socket *so,
    struct sockaddr **paddr,
    struct uio *uio, struct mbuf **mp0,
    struct mbuf **controlp, int *flagsp)
{
	return EOPNOTSUPP;
}

int
pru_shutdown_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
{
	return EOPNOTSUPP;
}

int pru_sosend(struct socket *so, struct sockaddr *addr,
    struct uio *uio, struct mbuf *top,
    struct mbuf *control, int flags)
{
	return EOPNOTSUPP;
}

int pru_soreceive(struct socket *so,
    struct sockaddr **paddr,
    struct uio *uio, struct mbuf **mp0,
    struct mbuf **controlp, int *flagsp)
{
	return EOPNOTSUPP;
}

int
pru_sopoll_notsupp(__unused struct socket *so, __unused int events,
    __unused kauth_cred_t cred, __unused void *wql)
{
	return EOPNOTSUPP;
}
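
/*
 * Editor's sketch (not part of the original file): a minimal protocol
 * fills its pr_usrreqs table with these stubs for the entry points it
 * does not implement.  The positional initializers assume the field
 * order declared in sys/protosw.h for this era; verify against the
 * header before use.
 */
#if 0
static struct pr_usrreqs myproto_usrreqs = {
	pru_abort_notsupp, pru_accept_notsupp, pru_attach_notsupp,
	pru_bind_notsupp, pru_connect_notsupp, pru_connect2_notsupp,
	pru_control_notsupp, pru_detach_notsupp, pru_disconnect_notsupp,
	pru_listen_notsupp, pru_peeraddr_notsupp, pru_rcvd_notsupp,
	pru_rcvoob_notsupp, pru_send_notsupp, pru_sense_null,
	pru_shutdown_notsupp, pru_sockaddr_notsupp, pru_sosend_notsupp,
	pru_soreceive_notsupp, pru_sopoll_notsupp
};
#endif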
/*
 * The following are macros on BSD and functions on Darwin
 */

/*
 * Do we need to notify the other side when I/O is possible?
 */
int
sb_notify(struct sockbuf *sb)
{
	return ((sb->sb_flags & (SB_WAIT|SB_SEL|SB_ASYNC|SB_UPCALL|SB_KNOTE)) != 0);
}
/*
 * How much space is there in a socket buffer (so->so_snd or so->so_rcv)?
 * This is problematic if the fields are unsigned, as the space might
 * still be negative (cc > hiwat or mbcnt > mbmax).  Should detect
 * overflow and return 0.  Should use "lmin" but it doesn't exist now.
 */
long
sbspace(struct sockbuf *sb)
{
	return ((long) imin((int)(sb->sb_hiwat - sb->sb_cc),
	    (int)(sb->sb_mbmax - sb->sb_mbcnt)));
}
/* do we have to send all at once on a socket? */
int
sosendallatonce(struct socket *so)
{
	return (so->so_proto->pr_flags & PR_ATOMIC);
}
/* can we read something from so? */
int
soreadable(struct socket *so)
{
	return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
	    (so->so_state & SS_CANTRCVMORE) ||
	    so->so_comp.tqh_first || so->so_error);
}
/* can we write something to so? */
int
sowriteable(struct socket *so)
{
	return ((sbspace(&(so)->so_snd) >= (so)->so_snd.sb_lowat &&
	    ((so->so_state & SS_ISCONNECTED) ||
	     (so->so_proto->pr_flags & PR_CONNREQUIRED) == 0)) ||
	    (so->so_state & SS_CANTSENDMORE) ||
	    so->so_error);
}
/* adjust counters in sb reflecting allocation of m */
void
sballoc(struct sockbuf *sb, struct mbuf *m)
{
	sb->sb_cc += m->m_len;
	sb->sb_mbcnt += MSIZE;
	if (m->m_flags & M_EXT)
		sb->sb_mbcnt += m->m_ext.ext_size;
}

/* adjust counters in sb reflecting freeing of m */
void
sbfree(struct sockbuf *sb, struct mbuf *m)
{
	sb->sb_cc -= m->m_len;
	sb->sb_mbcnt -= MSIZE;
	if (m->m_flags & M_EXT)
		sb->sb_mbcnt -= m->m_ext.ext_size;
}
/*
 * Set lock on sockbuf sb; sleep if lock is already held.
 * Unless SB_NOINTR is set on sockbuf, sleep is interruptible.
 * Returns error without lock if sleep is interrupted.
 */
int
sblock(struct sockbuf *sb, int wf)
{
	/*
	 * Rewritten from a single conditional/comma expression whose
	 * trailing ", 0" made the whole expression always evaluate to 0,
	 * silently discarding sb_lock()'s error and EWOULDBLOCK, contrary
	 * to the comment above.
	 */
	if (sb->sb_flags & SB_LOCK)
		return ((wf == M_WAIT) ? sb_lock(sb) : EWOULDBLOCK);
	sb->sb_flags |= SB_LOCK;
	return (0);
}
/* release lock on sockbuf sb */
void
sbunlock(struct sockbuf *sb, int keeplocked)
{
	struct socket *so = sb->sb_so;
	int lr, lr_saved;
	lck_mtx_t *mutex_held;

#ifdef __ppc__
	__asm__ volatile("mflr %0" : "=r" (lr));
	lr_saved = lr;
#endif
	sb->sb_flags &= ~SB_LOCK;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;

	if (keeplocked == 0)
		lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (sb->sb_flags & SB_WANT) {
		sb->sb_flags &= ~SB_WANT;
		if (so->so_usecount < 0)
			panic("sbunlock: b4 wakeup so=%x ref=%d lr=%x sb_flags=%x\n",
			    sb->sb_so, so->so_usecount, lr_saved, sb->sb_flags);

		wakeup((caddr_t)&(sb)->sb_flags);
	}
	if (keeplocked == 0) {	/* unlock on exit */
		so->so_usecount--;
		if (so->so_usecount < 0)
			panic("sbunlock: unlock on exit so=%x ref=%d lr=%x sb_flags=%x\n",
			    so, so->so_usecount, lr_saved, sb->sb_flags);
		so->reserved4 = lr_saved;
		lck_mtx_unlock(mutex_held);
	}
}
void
sorwakeup(struct socket *so)
{
	if (sb_notify(&so->so_rcv))
		sowakeup(so, &so->so_rcv);
}

void
sowwakeup(struct socket *so)
{
	if (sb_notify(&so->so_snd))
		sowakeup(so, &so->so_snd);
}
/*
 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
 */
struct sockaddr *
dup_sockaddr(sa, canwait)
	struct sockaddr *sa;
	int canwait;
{
	struct sockaddr *sa2;

	MALLOC(sa2, struct sockaddr *, sa->sa_len, M_SONAME,
	    canwait ? M_WAITOK : M_NOWAIT);
	if (sa2)
		bcopy(sa, sa2, sa->sa_len);
	return sa2;
}
/*
 * Create an external-format (``xsocket'') structure using the information
 * in the kernel-format socket structure pointed to by so.  This is done
 * to reduce the spew of irrelevant information over this interface,
 * to isolate user code from changes in the kernel structure, and
 * potentially to provide information-hiding if we decide that
 * some of this information should be hidden from users.
 */
void
sotoxsocket(struct socket *so, struct xsocket *xso)
{
	xso->xso_len = sizeof *xso;
	xso->xso_so = so;
	xso->so_type = so->so_type;
	xso->so_options = so->so_options;
	xso->so_linger = so->so_linger;
	xso->so_state = so->so_state;
	xso->so_pcb = so->so_pcb;
	if (so->so_proto) {
		xso->xso_protocol = so->so_proto->pr_protocol;
		xso->xso_family = so->so_proto->pr_domain->dom_family;
	} else
		xso->xso_protocol = xso->xso_family = 0;
	xso->so_qlen = so->so_qlen;
	xso->so_incqlen = so->so_incqlen;
	xso->so_qlimit = so->so_qlimit;
	xso->so_timeo = so->so_timeo;
	xso->so_error = so->so_error;
	xso->so_pgid = so->so_pgid;
	xso->so_oobmark = so->so_oobmark;
	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
	xso->so_uid = so->so_uid;
}
/*
 * This does the same for sockbufs.  Note that the xsockbuf structure,
 * since it is always embedded in a socket, does not include a self
 * pointer nor a length.  We make this entry point public in case
 * some other mechanism needs it.
 */
void
sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
{
	xsb->sb_cc = sb->sb_cc;
	xsb->sb_hiwat = sb->sb_hiwat;
	xsb->sb_mbcnt = sb->sb_mbcnt;
	xsb->sb_mbmax = sb->sb_mbmax;
	xsb->sb_lowat = sb->sb_lowat;
	xsb->sb_flags = sb->sb_flags;
	xsb->sb_timeo = (u_long)
	    (sb->sb_timeo.tv_sec * hz) + sb->sb_timeo.tv_usec / tick;
	if (xsb->sb_timeo == 0 && sb->sb_timeo.tv_usec != 0)
		xsb->sb_timeo = 1;
}
/*
 * Here is the definition of some of the basic objects in the kern.ipc
 * branch of the MIB.
 */
SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");

/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */
static int dummy;
SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, "");

SYSCTL_INT(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLFLAG_RW,
    &sb_max, 0, "Maximum socket buffer size");
SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RD,
    &maxsockets, 0, "Maximum number of sockets available");
SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
    &sb_efficiency, 0, "");
SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD,
    &nmbclusters, 0, "");
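
/*
 * Editor's sketch (not part of the original file): reading one of the
 * OIDs above from user space.  This is plain userland C, shown only to
 * document the exported MIB names.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int maxsockbuf;
	size_t len = sizeof(maxsockbuf);

	if (sysctlbyname("kern.ipc.maxsockbuf", &maxsockbuf, &len,
	    NULL, 0) == -1) {
		perror("sysctlbyname");
		return 1;
	}
	printf("kern.ipc.maxsockbuf = %d\n", maxsockbuf);
	return 0;
}
#endif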