2 * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61 * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/domain.h>
73 #include <sys/kernel.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/malloc.h>
78 #include <sys/mcache.h>
79 #include <sys/protosw.h>
81 #include <sys/socket.h>
82 #include <sys/socketvar.h>
83 #include <sys/signalvar.h>
84 #include <sys/sysctl.h>
85 #include <sys/syslog.h>
86 #include <sys/unpcb.h>
88 #include <kern/locks.h>
89 #include <net/route.h>
90 #include <net/content_filter.h>
91 #include <netinet/in.h>
92 #include <netinet/in_pcb.h>
93 #include <netinet/tcp_var.h>
94 #include <sys/kdebug.h>
95 #include <libkern/OSAtomic.h>
98 #include <security/mac_framework.h>
101 #include <mach/vm_param.h>
104 #include <netinet/mptcp_var.h>
107 #define DBG_FNC_SBDROP NETDBG_CODE(DBG_NETSOCK, 4)
108 #define DBG_FNC_SBAPPEND NETDBG_CODE(DBG_NETSOCK, 5)
110 SYSCTL_DECL(_kern_ipc
);
112 __private_extern__ u_int32_t net_io_policy_throttle_best_effort
= 0;
113 SYSCTL_INT(_kern_ipc
, OID_AUTO
, throttle_best_effort
,
114 CTLFLAG_RW
| CTLFLAG_LOCKED
, &net_io_policy_throttle_best_effort
, 0, "");
116 static inline void sbcompress(struct sockbuf
*, struct mbuf
*, struct mbuf
*);
117 static struct socket
*sonewconn_internal(struct socket
*, int);
118 static int sbappendcontrol_internal(struct sockbuf
*, struct mbuf
*,
120 static void soevent_ifdenied(struct socket
*);
123 * Primitive routines for operating on sockets and socket buffers
125 static int soqlimitcompat
= 1;
126 static int soqlencomp
= 0;
129 * Based on the number of mbuf clusters configured, high_sb_max and sb_max can
130 * get scaled up or down to suit that memory configuration. high_sb_max is a
131 * higher limit on sb_max that is checked when sb_max gets set through sysctl.
134 u_int32_t sb_max
= SB_MAX
; /* XXX should be static */
135 u_int32_t high_sb_max
= SB_MAX
;
137 static u_int32_t sb_efficiency
= 8; /* parameter for sbreserve() */
138 int32_t total_sbmb_cnt
__attribute__((aligned(8))) = 0;
139 int32_t total_sbmb_cnt_floor
__attribute__((aligned(8))) = 0;
140 int32_t total_sbmb_cnt_peak
__attribute__((aligned(8))) = 0;
141 int64_t sbmb_limreached
__attribute__((aligned(8))) = 0;
143 u_int32_t net_io_policy_log
= 0; /* log socket policy changes */
144 #if CONFIG_PROC_UUID_POLICY
145 u_int32_t net_io_policy_uuid
= 1; /* enable UUID socket policy */
146 #endif /* CONFIG_PROC_UUID_POLICY */
149 * Procedures to manipulate state flags of socket
150 * and do appropriate wakeups. Normal sequence from the
151 * active (originating) side is that soisconnecting() is
152 * called during processing of connect() call,
153 * resulting in an eventual call to soisconnected() if/when the
154 * connection is established. When the connection is torn down
155 * soisdisconnecting() is called during processing of disconnect() call,
156 * and soisdisconnected() is called when the connection to the peer
157 * is totally severed. The semantics of these routines are such that
158 * connectionless protocols can call soisconnected() and soisdisconnected()
159 * only, bypassing the in-progress calls when setting up a ``connection''
162 * From the passive side, a socket is created with
163 * two queues of sockets: so_incomp for connections in progress
164 * and so_comp for connections already made and awaiting user acceptance.
165 * As a protocol is preparing incoming connections, it creates a socket
166 * structure queued on so_incomp by calling sonewconn(). When the connection
167 * is established, soisconnected() is called, and transfers the
168 * socket structure to so_comp, making it available to accept().
170 * If a socket is closed with sockets on either
171 * so_incomp or so_comp, these sockets are dropped.
173 * If higher level protocols are implemented in
174 * the kernel, the wakeups done here will sometimes
175 * cause software-interrupt process scheduling.
178 soisconnecting(struct socket
*so
)
180 so
->so_state
&= ~(SS_ISCONNECTED
| SS_ISDISCONNECTING
);
181 so
->so_state
|= SS_ISCONNECTING
;
183 sflt_notify(so
, sock_evt_connecting
, NULL
);
187 soisconnected(struct socket
*so
)
190 * If socket is subject to filter and is pending initial verdict,
191 * delay marking socket as connected and do not present the connected
192 * socket to user just yet.
194 if (cfil_sock_connected_pending_verdict(so
)) {
198 so
->so_state
&= ~(SS_ISCONNECTING
| SS_ISDISCONNECTING
| SS_ISCONFIRMING
);
199 so
->so_state
|= SS_ISCONNECTED
;
201 soreserve_preconnect(so
, 0);
203 sflt_notify(so
, sock_evt_connected
, NULL
);
205 if (so
->so_head
!= NULL
&& (so
->so_state
& SS_INCOMP
)) {
206 struct socket
*head
= so
->so_head
;
210 * Enforce lock order when the protocol has per socket locks
212 if (head
->so_proto
->pr_getlock
!= NULL
) {
213 socket_lock(head
, 1);
214 so_acquire_accept_list(head
, so
);
217 if (so
->so_head
== head
&& (so
->so_state
& SS_INCOMP
)) {
218 so
->so_state
&= ~SS_INCOMP
;
219 so
->so_state
|= SS_COMP
;
220 TAILQ_REMOVE(&head
->so_incomp
, so
, so_list
);
221 TAILQ_INSERT_TAIL(&head
->so_comp
, so
, so_list
);
225 * We have to release the accept list in
226 * case a socket callback calls sock_accept()
229 so_release_accept_list(head
);
230 socket_unlock(so
, 0);
232 postevent(head
, 0, EV_RCONN
);
234 wakeup_one((caddr_t
)&head
->so_timeo
);
237 socket_unlock(head
, 1);
240 } else if (locked
!= 0) {
241 so_release_accept_list(head
);
242 socket_unlock(head
, 1);
245 postevent(so
, 0, EV_WCONN
);
246 wakeup((caddr_t
)&so
->so_timeo
);
249 soevent(so
, SO_FILT_HINT_LOCKED
| SO_FILT_HINT_CONNECTED
|
250 SO_FILT_HINT_CONNINFO_UPDATED
);
255 socanwrite(struct socket
*so
)
257 return (so
->so_state
& SS_ISCONNECTED
) ||
258 !(so
->so_proto
->pr_flags
& PR_CONNREQUIRED
) ||
259 (so
->so_flags1
& SOF1_PRECONNECT_DATA
);
263 soisdisconnecting(struct socket
*so
)
265 so
->so_state
&= ~SS_ISCONNECTING
;
266 so
->so_state
|= (SS_ISDISCONNECTING
| SS_CANTRCVMORE
| SS_CANTSENDMORE
);
267 soevent(so
, SO_FILT_HINT_LOCKED
);
268 sflt_notify(so
, sock_evt_disconnecting
, NULL
);
269 wakeup((caddr_t
)&so
->so_timeo
);
275 soisdisconnected(struct socket
*so
)
277 so
->so_state
&= ~(SS_ISCONNECTING
| SS_ISCONNECTED
| SS_ISDISCONNECTING
);
278 so
->so_state
|= (SS_CANTRCVMORE
| SS_CANTSENDMORE
| SS_ISDISCONNECTED
);
279 soevent(so
, SO_FILT_HINT_LOCKED
| SO_FILT_HINT_DISCONNECTED
|
280 SO_FILT_HINT_CONNINFO_UPDATED
);
281 sflt_notify(so
, sock_evt_disconnected
, NULL
);
282 wakeup((caddr_t
)&so
->so_timeo
);
287 /* Notify content filters as soon as we cannot send/receive data */
288 cfil_sock_notify_shutdown(so
, SHUT_RDWR
);
289 #endif /* CONTENT_FILTER */
293 * This function will issue a wakeup like soisdisconnected but it will not
294 * notify the socket filters. This will avoid unlocking the socket
295 * in the midst of closing it.
298 sodisconnectwakeup(struct socket
*so
)
300 so
->so_state
&= ~(SS_ISCONNECTING
| SS_ISCONNECTED
| SS_ISDISCONNECTING
);
301 so
->so_state
|= (SS_CANTRCVMORE
| SS_CANTSENDMORE
| SS_ISDISCONNECTED
);
302 soevent(so
, SO_FILT_HINT_LOCKED
| SO_FILT_HINT_DISCONNECTED
|
303 SO_FILT_HINT_CONNINFO_UPDATED
);
304 wakeup((caddr_t
)&so
->so_timeo
);
309 /* Notify content filters as soon as we cannot send/receive data */
310 cfil_sock_notify_shutdown(so
, SHUT_RDWR
);
311 #endif /* CONTENT_FILTER */
315 * When an attempt at a new connection is noted on a socket
316 * which accepts connections, sonewconn is called. If the
317 * connection is possible (subject to space constraints, etc.)
318 * then we allocate a new structure, propoerly linked into the
319 * data structure of the original socket, and return this.
320 * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED.
322 static struct socket
*
323 sonewconn_internal(struct socket
*head
, int connstatus
)
325 int so_qlen
, error
= 0;
327 lck_mtx_t
*mutex_held
;
329 if (head
->so_proto
->pr_getlock
!= NULL
) {
330 mutex_held
= (*head
->so_proto
->pr_getlock
)(head
, 0);
332 mutex_held
= head
->so_proto
->pr_domain
->dom_mtx
;
334 LCK_MTX_ASSERT(mutex_held
, LCK_MTX_ASSERT_OWNED
);
338 * This is the default case; so_qlen represents the
339 * sum of both incomplete and completed queues.
341 so_qlen
= head
->so_qlen
;
344 * When kern.ipc.soqlencomp is set to 1, so_qlen
345 * represents only the completed queue. Since we
346 * cannot let the incomplete queue goes unbounded
347 * (in case of SYN flood), we cap the incomplete
348 * queue length to at most somaxconn, and use that
349 * as so_qlen so that we fail immediately below.
351 so_qlen
= head
->so_qlen
- head
->so_incqlen
;
352 if (head
->so_incqlen
> somaxconn
) {
358 (soqlimitcompat
? head
->so_qlimit
: (3 * head
->so_qlimit
/ 2))) {
359 return (struct socket
*)0;
361 so
= soalloc(1, SOCK_DOM(head
), head
->so_type
);
363 return (struct socket
*)0;
365 /* check if head was closed during the soalloc */
366 if (head
->so_proto
== NULL
) {
368 return (struct socket
*)0;
371 so
->so_type
= head
->so_type
;
372 so
->so_options
= head
->so_options
& ~SO_ACCEPTCONN
;
373 so
->so_linger
= head
->so_linger
;
374 so
->so_state
= head
->so_state
| SS_NOFDREF
;
375 so
->so_proto
= head
->so_proto
;
376 so
->so_timeo
= head
->so_timeo
;
377 so
->so_pgid
= head
->so_pgid
;
378 kauth_cred_ref(head
->so_cred
);
379 so
->so_cred
= head
->so_cred
;
380 so
->last_pid
= head
->last_pid
;
381 so
->last_upid
= head
->last_upid
;
382 memcpy(so
->last_uuid
, head
->last_uuid
, sizeof(so
->last_uuid
));
383 if (head
->so_flags
& SOF_DELEGATED
) {
384 so
->e_pid
= head
->e_pid
;
385 so
->e_upid
= head
->e_upid
;
386 memcpy(so
->e_uuid
, head
->e_uuid
, sizeof(so
->e_uuid
));
388 /* inherit socket options stored in so_flags */
389 so
->so_flags
= head
->so_flags
&
390 (SOF_NOSIGPIPE
| SOF_NOADDRAVAIL
| SOF_REUSESHAREUID
|
391 SOF_NOTIFYCONFLICT
| SOF_BINDRANDOMPORT
| SOF_NPX_SETOPTSHUT
|
392 SOF_NODEFUNCT
| SOF_PRIVILEGED_TRAFFIC_CLASS
| SOF_NOTSENT_LOWAT
|
393 SOF_USELRO
| SOF_DELEGATED
);
394 so
->so_flags1
|= SOF1_INBOUND
;
396 so
->next_lock_lr
= 0;
397 so
->next_unlock_lr
= 0;
399 so
->so_rcv
.sb_flags
|= SB_RECV
; /* XXX */
400 so
->so_rcv
.sb_so
= so
->so_snd
.sb_so
= so
;
401 TAILQ_INIT(&so
->so_evlist
);
403 #if CONFIG_MACF_SOCKET
404 mac_socket_label_associate_accept(head
, so
);
407 /* inherit traffic management properties of listener */
409 head
->so_flags1
& (SOF1_TRAFFIC_MGT_SO_BACKGROUND
| SOF1_TC_NET_SERV_TYPE
|
410 SOF1_QOSMARKING_ALLOWED
| SOF1_QOSMARKING_POLICY_OVERRIDE
);
411 so
->so_background_thread
= head
->so_background_thread
;
412 so
->so_traffic_class
= head
->so_traffic_class
;
413 so
->so_netsvctype
= head
->so_netsvctype
;
415 if (soreserve(so
, head
->so_snd
.sb_hiwat
, head
->so_rcv
.sb_hiwat
)) {
417 return (struct socket
*)0;
419 so
->so_rcv
.sb_flags
|= (head
->so_rcv
.sb_flags
& SB_USRSIZE
);
420 so
->so_snd
.sb_flags
|= (head
->so_snd
.sb_flags
& SB_USRSIZE
);
423 * Must be done with head unlocked to avoid deadlock
424 * for protocol with per socket mutexes.
426 if (head
->so_proto
->pr_unlock
) {
427 socket_unlock(head
, 0);
429 if (((*so
->so_proto
->pr_usrreqs
->pru_attach
)(so
, 0, NULL
) != 0) ||
432 if (head
->so_proto
->pr_unlock
) {
433 socket_lock(head
, 0);
435 return (struct socket
*)0;
437 if (head
->so_proto
->pr_unlock
) {
438 socket_lock(head
, 0);
440 * Radar 7385998 Recheck that the head is still accepting
441 * to avoid race condition when head is getting closed.
443 if ((head
->so_options
& SO_ACCEPTCONN
) == 0) {
444 so
->so_state
&= ~SS_NOFDREF
;
446 return (struct socket
*)0;
450 if (so
->so_proto
->pr_copy_last_owner
!= NULL
) {
451 (*so
->so_proto
->pr_copy_last_owner
)(so
, head
);
453 atomic_add_32(&so
->so_proto
->pr_domain
->dom_refs
, 1);
455 /* Insert in head appropriate lists */
456 so_acquire_accept_list(head
, NULL
);
461 * Since this socket is going to be inserted into the incomp
462 * queue, it can be picked up by another thread in
463 * tcp_dropdropablreq to get dropped before it is setup..
464 * To prevent this race, set in-progress flag which can be
467 so
->so_flags
|= SOF_INCOMP_INPROGRESS
;
470 TAILQ_INSERT_TAIL(&head
->so_comp
, so
, so_list
);
471 so
->so_state
|= SS_COMP
;
473 TAILQ_INSERT_TAIL(&head
->so_incomp
, so
, so_list
);
474 so
->so_state
|= SS_INCOMP
;
479 so_release_accept_list(head
);
481 /* Attach socket filters for this protocol */
485 so
->so_state
|= connstatus
;
487 wakeup((caddr_t
)&head
->so_timeo
);
494 sonewconn(struct socket
*head
, int connstatus
, const struct sockaddr
*from
)
496 int error
= sflt_connectin(head
, from
);
501 return sonewconn_internal(head
, connstatus
);
505 * Socantsendmore indicates that no more data will be sent on the
506 * socket; it would normally be applied to a socket when the user
507 * informs the system that no more data is to be sent, by the protocol
508 * code (in case PRU_SHUTDOWN). Socantrcvmore indicates that no more data
509 * will be received, and will normally be applied to the socket by a
510 * protocol when it detects that the peer will send no more data.
511 * Data queued for reading in the socket may yet be read.
515 socantsendmore(struct socket
*so
)
517 so
->so_state
|= SS_CANTSENDMORE
;
518 soevent(so
, SO_FILT_HINT_LOCKED
| SO_FILT_HINT_CANTSENDMORE
);
519 sflt_notify(so
, sock_evt_cantsendmore
, NULL
);
524 socantrcvmore(struct socket
*so
)
526 so
->so_state
|= SS_CANTRCVMORE
;
527 soevent(so
, SO_FILT_HINT_LOCKED
| SO_FILT_HINT_CANTRCVMORE
);
528 sflt_notify(so
, sock_evt_cantrecvmore
, NULL
);
533 * Wait for data to arrive at/drain from a socket buffer.
536 sbwait(struct sockbuf
*sb
)
538 boolean_t nointr
= (sb
->sb_flags
& SB_NOINTR
);
539 void *lr_saved
= __builtin_return_address(0);
540 struct socket
*so
= sb
->sb_so
;
541 lck_mtx_t
*mutex_held
;
546 panic("%s: null so, sb=%p sb_flags=0x%x lr=%p\n",
547 __func__
, sb
, sb
->sb_flags
, lr_saved
);
549 } else if (so
->so_usecount
< 1) {
550 panic("%s: sb=%p sb_flags=0x%x sb_so=%p usecount=%d lr=%p "
551 "lrh= %s\n", __func__
, sb
, sb
->sb_flags
, so
,
552 so
->so_usecount
, lr_saved
, solockhistory_nr(so
));
556 if ((so
->so_state
& SS_DRAINING
) || (so
->so_flags
& SOF_DEFUNCT
)) {
558 if (so
->so_flags
& SOF_DEFUNCT
) {
559 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
560 "(%d)\n", __func__
, proc_selfpid(),
561 proc_best_name(current_proc()),
562 (uint64_t)VM_KERNEL_ADDRPERM(so
),
563 SOCK_DOM(so
), SOCK_TYPE(so
), error
);
568 if (so
->so_proto
->pr_getlock
!= NULL
) {
569 mutex_held
= (*so
->so_proto
->pr_getlock
)(so
, PR_F_WILLUNLOCK
);
571 mutex_held
= so
->so_proto
->pr_domain
->dom_mtx
;
574 LCK_MTX_ASSERT(mutex_held
, LCK_MTX_ASSERT_OWNED
);
576 ts
.tv_sec
= sb
->sb_timeo
.tv_sec
;
577 ts
.tv_nsec
= sb
->sb_timeo
.tv_usec
* 1000;
580 VERIFY(sb
->sb_waiters
!= 0);
582 error
= msleep((caddr_t
)&sb
->sb_cc
, mutex_held
,
583 nointr
? PSOCK
: PSOCK
| PCATCH
,
584 nointr
? "sbwait_nointr" : "sbwait", &ts
);
586 VERIFY(sb
->sb_waiters
!= 0);
589 if (so
->so_usecount
< 1) {
590 panic("%s: 2 sb=%p sb_flags=0x%x sb_so=%p usecount=%d lr=%p "
591 "lrh= %s\n", __func__
, sb
, sb
->sb_flags
, so
,
592 so
->so_usecount
, lr_saved
, solockhistory_nr(so
));
596 if ((so
->so_state
& SS_DRAINING
) || (so
->so_flags
& SOF_DEFUNCT
)) {
598 if (so
->so_flags
& SOF_DEFUNCT
) {
599 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
600 "(%d)\n", __func__
, proc_selfpid(),
601 proc_best_name(current_proc()),
602 (uint64_t)VM_KERNEL_ADDRPERM(so
),
603 SOCK_DOM(so
), SOCK_TYPE(so
), error
);
611 sbwakeup(struct sockbuf
*sb
)
613 if (sb
->sb_waiters
> 0) {
614 wakeup((caddr_t
)&sb
->sb_cc
);
619 * Wakeup processes waiting on a socket buffer.
620 * Do asynchronous notification via SIGIO
621 * if the socket has the SS_ASYNC flag set.
624 sowakeup(struct socket
*so
, struct sockbuf
*sb
, struct socket
*so2
)
626 if (so
->so_flags
& SOF_DEFUNCT
) {
627 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] si 0x%x, "
628 "fl 0x%x [%s]\n", __func__
, proc_selfpid(),
629 proc_best_name(current_proc()),
630 (uint64_t)VM_KERNEL_ADDRPERM(so
), SOCK_DOM(so
),
631 SOCK_TYPE(so
), (uint32_t)sb
->sb_sel
.si_flags
, sb
->sb_flags
,
632 (sb
->sb_flags
& SB_RECV
) ? "rcv" : "snd");
635 sb
->sb_flags
&= ~SB_SEL
;
636 selwakeup(&sb
->sb_sel
);
638 if (so
->so_state
& SS_ASYNC
) {
639 if (so
->so_pgid
< 0) {
640 gsignal(-so
->so_pgid
, SIGIO
);
641 } else if (so
->so_pgid
> 0) {
642 proc_signal(so
->so_pgid
, SIGIO
);
645 if (sb
->sb_flags
& SB_KNOTE
) {
646 KNOTE(&sb
->sb_sel
.si_note
, SO_FILT_HINT_LOCKED
);
648 if (sb
->sb_flags
& SB_UPCALL
) {
649 void (*sb_upcall
)(struct socket
*, void *, int);
650 caddr_t sb_upcallarg
;
651 int lock
= !(sb
->sb_flags
& SB_UPCALL_LOCK
);
653 sb_upcall
= sb
->sb_upcall
;
654 sb_upcallarg
= sb
->sb_upcallarg
;
655 /* Let close know that we're about to do an upcall */
656 so
->so_upcallusecount
++;
660 struct unpcb
*unp
= sotounpcb(so2
);
661 unp
->unp_flags
|= UNP_DONTDISCONNECT
;
664 socket_unlock(so2
, 0);
666 socket_unlock(so
, 0);
668 (*sb_upcall
)(so
, sb_upcallarg
, M_DONTWAIT
);
670 if (so2
&& so
> so2
) {
674 unp
= sotounpcb(so2
);
676 if (unp
->rw_thrcount
== 0) {
677 unp
->unp_flags
&= ~UNP_DONTDISCONNECT
;
684 if (so2
&& so
< so2
) {
688 unp
= sotounpcb(so2
);
690 if (unp
->rw_thrcount
== 0) {
691 unp
->unp_flags
&= ~UNP_DONTDISCONNECT
;
697 so
->so_upcallusecount
--;
698 /* Tell close that it's safe to proceed */
699 if ((so
->so_flags
& SOF_CLOSEWAIT
) &&
700 so
->so_upcallusecount
== 0) {
701 wakeup((caddr_t
)&so
->so_upcallusecount
);
706 * Trap disconnection events for content filters
708 if ((so
->so_flags
& SOF_CONTENT_FILTER
) != 0) {
709 if ((sb
->sb_flags
& SB_RECV
)) {
710 if (so
->so_state
& (SS_CANTRCVMORE
)) {
711 cfil_sock_notify_shutdown(so
, SHUT_RD
);
714 if (so
->so_state
& (SS_CANTSENDMORE
)) {
715 cfil_sock_notify_shutdown(so
, SHUT_WR
);
719 #endif /* CONTENT_FILTER */
723 * Socket buffer (struct sockbuf) utility routines.
725 * Each socket contains two socket buffers: one for sending data and
726 * one for receiving data. Each buffer contains a queue of mbufs,
727 * information about the number of mbufs and amount of data in the
728 * queue, and other fields allowing select() statements and notification
729 * on data availability to be implemented.
731 * Data stored in a socket buffer is maintained as a list of records.
732 * Each record is a list of mbufs chained together with the m_next
733 * field. Records are chained together with the m_nextpkt field. The upper
734 * level routine soreceive() expects the following conventions to be
735 * observed when placing information in the receive buffer:
737 * 1. If the protocol requires each message be preceded by the sender's
738 * name, then a record containing that name must be present before
739 * any associated data (mbuf's must be of type MT_SONAME).
740 * 2. If the protocol supports the exchange of ``access rights'' (really
741 * just additional data associated with the message), and there are
742 * ``rights'' to be received, then a record containing this data
743 * should be present (mbuf's must be of type MT_RIGHTS).
744 * 3. If a name or rights record exists, then it must be followed by
745 * a data record, perhaps of zero length.
747 * Before using a new socket structure it is first necessary to reserve
748 * buffer space to the socket, by calling sbreserve(). This should commit
749 * some of the available buffer space in the system buffer pool for the
750 * socket (currently, it does nothing but enforce limits). The space
751 * should be released by calling sbrelease() when the socket is destroyed.
759 soreserve(struct socket
*so
, u_int32_t sndcc
, u_int32_t rcvcc
)
762 * We do not want to fail the creation of a socket
763 * when kern.ipc.maxsockbuf is less than the
764 * default socket buffer socket size of the protocol
765 * so force the buffer sizes to be at most the
766 * limit enforced by sbreserve()
768 uint64_t maxcc
= (uint64_t)sb_max
* MCLBYTES
/ (MSIZE
+ MCLBYTES
);
775 if (sbreserve(&so
->so_snd
, sndcc
) == 0) {
778 so
->so_snd
.sb_idealsize
= sndcc
;
781 if (sbreserve(&so
->so_rcv
, rcvcc
) == 0) {
784 so
->so_rcv
.sb_idealsize
= rcvcc
;
787 if (so
->so_rcv
.sb_lowat
== 0) {
788 so
->so_rcv
.sb_lowat
= 1;
790 if (so
->so_snd
.sb_lowat
== 0) {
791 so
->so_snd
.sb_lowat
= MCLBYTES
;
793 if (so
->so_snd
.sb_lowat
> so
->so_snd
.sb_hiwat
) {
794 so
->so_snd
.sb_lowat
= so
->so_snd
.sb_hiwat
;
798 so
->so_snd
.sb_flags
&= ~SB_SEL
;
799 selthreadclear(&so
->so_snd
.sb_sel
);
800 sbrelease(&so
->so_snd
);
806 soreserve_preconnect(struct socket
*so
, unsigned int pre_cc
)
808 /* As of now, same bytes for both preconnect read and write */
809 so
->so_snd
.sb_preconn_hiwat
= pre_cc
;
810 so
->so_rcv
.sb_preconn_hiwat
= pre_cc
;
814 * Allot mbufs to a sockbuf.
815 * Attempt to scale mbmax so that mbcnt doesn't become limiting
816 * if buffering efficiency is near the normal case.
819 sbreserve(struct sockbuf
*sb
, u_int32_t cc
)
821 if ((u_quad_t
)cc
> (u_quad_t
)sb_max
* MCLBYTES
/ (MSIZE
+ MCLBYTES
)) {
825 sb
->sb_mbmax
= min(cc
* sb_efficiency
, sb_max
);
826 if (sb
->sb_lowat
> sb
->sb_hiwat
) {
827 sb
->sb_lowat
= sb
->sb_hiwat
;
833 * Free mbufs held by a socket, and reserved mbuf space.
835 /* WARNING needs to do selthreadclear() before calling this */
837 sbrelease(struct sockbuf
*sb
)
845 * Routines to add and remove
846 * data from an mbuf queue.
848 * The routines sbappend() or sbappendrecord() are normally called to
849 * append new mbufs to a socket buffer, after checking that adequate
850 * space is available, comparing the function sbspace() with the amount
851 * of data to be added. sbappendrecord() differs from sbappend() in
852 * that data supplied is treated as the beginning of a new record.
853 * To place a sender's address, optional access rights, and data in a
854 * socket receive buffer, sbappendaddr() should be used. To place
855 * access rights and data in a socket receive buffer, sbappendrights()
856 * should be used. In either case, the new data begins a new record.
857 * Note that unlike sbappend() and sbappendrecord(), these routines check
858 * for the caller that there will be enough space to store the data.
859 * Each fails if there is not enough space, or if it cannot find mbufs
860 * to store additional information in.
862 * Reliable protocols may use the socket send buffer to hold data
863 * awaiting acknowledgement. Data is normally copied from a socket
864 * send buffer in a protocol with m_copy for output to a peer,
865 * and then removing the data from the socket buffer with sbdrop()
866 * or sbdroprecord() when the data is acknowledged by the peer.
870 * Append mbuf chain m to the last record in the
871 * socket buffer sb. The additional space associated
872 * the mbuf chain is recorded in sb. Empty mbufs are
873 * discarded and mbufs are compacted where possible.
876 sbappend(struct sockbuf
*sb
, struct mbuf
*m
)
878 struct socket
*so
= sb
->sb_so
;
880 if (m
== NULL
|| (sb
->sb_flags
& SB_DROP
)) {
887 SBLASTRECORDCHK(sb
, "sbappend 1");
889 if (sb
->sb_lastrecord
!= NULL
&& (sb
->sb_mbtail
->m_flags
& M_EOR
)) {
890 return sbappendrecord(sb
, m
);
893 if (sb
->sb_flags
& SB_RECV
&& !(m
&& m
->m_flags
& M_SKIPCFIL
)) {
894 int error
= sflt_data_in(so
, NULL
, &m
, NULL
, 0);
895 SBLASTRECORDCHK(sb
, "sbappend 2");
899 error
= cfil_sock_data_in(so
, NULL
, m
, NULL
, 0);
901 #endif /* CONTENT_FILTER */
904 if (error
!= EJUSTRETURN
) {
910 m
->m_flags
&= ~M_SKIPCFIL
;
913 /* If this is the first record, it's also the last record */
914 if (sb
->sb_lastrecord
== NULL
) {
915 sb
->sb_lastrecord
= m
;
918 sbcompress(sb
, m
, sb
->sb_mbtail
);
919 SBLASTRECORDCHK(sb
, "sbappend 3");
924 * Similar to sbappend, except that this is optimized for stream sockets.
927 sbappendstream(struct sockbuf
*sb
, struct mbuf
*m
)
929 struct socket
*so
= sb
->sb_so
;
931 if (m
== NULL
|| (sb
->sb_flags
& SB_DROP
)) {
938 if (m
->m_nextpkt
!= NULL
|| (sb
->sb_mb
!= sb
->sb_lastrecord
)) {
939 panic("sbappendstream: nexpkt %p || mb %p != lastrecord %p\n",
940 m
->m_nextpkt
, sb
->sb_mb
, sb
->sb_lastrecord
);
944 SBLASTMBUFCHK(sb
, __func__
);
946 if (sb
->sb_flags
& SB_RECV
&& !(m
&& m
->m_flags
& M_SKIPCFIL
)) {
947 int error
= sflt_data_in(so
, NULL
, &m
, NULL
, 0);
948 SBLASTRECORDCHK(sb
, "sbappendstream 1");
952 error
= cfil_sock_data_in(so
, NULL
, m
, NULL
, 0);
954 #endif /* CONTENT_FILTER */
957 if (error
!= EJUSTRETURN
) {
963 m
->m_flags
&= ~M_SKIPCFIL
;
966 sbcompress(sb
, m
, sb
->sb_mbtail
);
967 sb
->sb_lastrecord
= sb
->sb_mb
;
968 SBLASTRECORDCHK(sb
, "sbappendstream 2");
974 sbcheck(struct sockbuf
*sb
)
978 u_int32_t len
= 0, mbcnt
= 0;
979 lck_mtx_t
*mutex_held
;
981 if (sb
->sb_so
->so_proto
->pr_getlock
!= NULL
) {
982 mutex_held
= (*sb
->sb_so
->so_proto
->pr_getlock
)(sb
->sb_so
, 0);
984 mutex_held
= sb
->sb_so
->so_proto
->pr_domain
->dom_mtx
;
987 LCK_MTX_ASSERT(mutex_held
, LCK_MTX_ASSERT_OWNED
);
989 if (sbchecking
== 0) {
993 for (m
= sb
->sb_mb
; m
; m
= n
) {
995 for (; m
; m
= m
->m_next
) {
998 /* XXX pretty sure this is bogus */
999 if (m
->m_flags
& M_EXT
) {
1000 mbcnt
+= m
->m_ext
.ext_size
;
1004 if (len
!= sb
->sb_cc
|| mbcnt
!= sb
->sb_mbcnt
) {
1005 panic("cc %ld != %ld || mbcnt %ld != %ld\n", len
, sb
->sb_cc
,
1006 mbcnt
, sb
->sb_mbcnt
);
1012 sblastrecordchk(struct sockbuf
*sb
, const char *where
)
1014 struct mbuf
*m
= sb
->sb_mb
;
1016 while (m
&& m
->m_nextpkt
) {
1020 if (m
!= sb
->sb_lastrecord
) {
1021 printf("sblastrecordchk: mb 0x%llx lastrecord 0x%llx "
1023 (uint64_t)VM_KERNEL_ADDRPERM(sb
->sb_mb
),
1024 (uint64_t)VM_KERNEL_ADDRPERM(sb
->sb_lastrecord
),
1025 (uint64_t)VM_KERNEL_ADDRPERM(m
));
1026 printf("packet chain:\n");
1027 for (m
= sb
->sb_mb
; m
!= NULL
; m
= m
->m_nextpkt
) {
1028 printf("\t0x%llx\n", (uint64_t)VM_KERNEL_ADDRPERM(m
));
1030 panic("sblastrecordchk from %s", where
);
1035 sblastmbufchk(struct sockbuf
*sb
, const char *where
)
1037 struct mbuf
*m
= sb
->sb_mb
;
1040 while (m
&& m
->m_nextpkt
) {
1044 while (m
&& m
->m_next
) {
1048 if (m
!= sb
->sb_mbtail
) {
1049 printf("sblastmbufchk: mb 0x%llx mbtail 0x%llx last 0x%llx\n",
1050 (uint64_t)VM_KERNEL_ADDRPERM(sb
->sb_mb
),
1051 (uint64_t)VM_KERNEL_ADDRPERM(sb
->sb_mbtail
),
1052 (uint64_t)VM_KERNEL_ADDRPERM(m
));
1053 printf("packet tree:\n");
1054 for (m
= sb
->sb_mb
; m
!= NULL
; m
= m
->m_nextpkt
) {
1056 for (n
= m
; n
!= NULL
; n
= n
->m_next
) {
1058 (uint64_t)VM_KERNEL_ADDRPERM(n
));
1062 panic("sblastmbufchk from %s", where
);
1067 * Similar to sbappend, except the mbuf chain begins a new record.
1070 sbappendrecord(struct sockbuf
*sb
, struct mbuf
*m0
)
1075 if (m0
== NULL
|| (sb
->sb_flags
& SB_DROP
)) {
1082 for (m
= m0
; m
!= NULL
; m
= m
->m_next
) {
1086 if (space
> sbspace(sb
) && !(sb
->sb_flags
& SB_UNIX
)) {
1091 if (sb
->sb_flags
& SB_RECV
&& !(m0
&& m0
->m_flags
& M_SKIPCFIL
)) {
1092 int error
= sflt_data_in(sb
->sb_so
, NULL
, &m0
, NULL
,
1093 sock_data_filt_flag_record
);
1097 error
= cfil_sock_data_in(sb
->sb_so
, NULL
, m0
, NULL
, 0);
1099 #endif /* CONTENT_FILTER */
1102 SBLASTRECORDCHK(sb
, "sbappendrecord 1");
1103 if (error
!= EJUSTRETURN
) {
1109 m0
->m_flags
&= ~M_SKIPCFIL
;
1113 * Note this permits zero length records.
1116 SBLASTRECORDCHK(sb
, "sbappendrecord 2");
1117 if (sb
->sb_lastrecord
!= NULL
) {
1118 sb
->sb_lastrecord
->m_nextpkt
= m0
;
1122 sb
->sb_lastrecord
= m0
;
1127 if (m
&& (m0
->m_flags
& M_EOR
)) {
1128 m0
->m_flags
&= ~M_EOR
;
1129 m
->m_flags
|= M_EOR
;
1131 sbcompress(sb
, m
, m0
);
1132 SBLASTRECORDCHK(sb
, "sbappendrecord 3");
1137 * Concatenate address (optional), control (optional) and data into one
1138 * single mbuf chain. If sockbuf *sb is passed in, space check will be
1141 * Returns: mbuf chain pointer if succeeded, NULL if failed
1144 sbconcat_mbufs(struct sockbuf
*sb
, struct sockaddr
*asa
, struct mbuf
*m0
, struct mbuf
*control
)
1146 struct mbuf
*m
= NULL
, *n
= NULL
;
1149 if (m0
&& (m0
->m_flags
& M_PKTHDR
) == 0) {
1150 panic("sbconcat_mbufs");
1154 space
+= m0
->m_pkthdr
.len
;
1156 for (n
= control
; n
; n
= n
->m_next
) {
1158 if (n
->m_next
== 0) { /* keep pointer to last control buf */
1164 if (asa
->sa_len
> MLEN
) {
1167 space
+= asa
->sa_len
;
1170 if (sb
!= NULL
&& space
> sbspace(sb
)) {
1175 n
->m_next
= m0
; /* concatenate data to control */
1181 MGET(m
, M_DONTWAIT
, MT_SONAME
);
1184 /* unchain control and data if necessary */
1189 m
->m_len
= asa
->sa_len
;
1190 bcopy((caddr_t
)asa
, mtod(m
, caddr_t
), asa
->sa_len
);
1192 m
->m_next
= control
;
1201 * Queue mbuf chain to the receive queue of a socket.
1202 * Parameter space is the total len of the mbuf chain.
1203 * If passed in, sockbuf space will be checked.
1205 * Returns: 0 Invalid mbuf chain
1209 sbappendchain(struct sockbuf
*sb
, struct mbuf
*m
, int space
)
1211 struct mbuf
*n
, *nlast
;
1217 if (space
!= 0 && space
> sbspace(sb
)) {
1221 for (n
= m
; n
->m_next
!= NULL
; n
= n
->m_next
) {
1227 if (sb
->sb_lastrecord
!= NULL
) {
1228 sb
->sb_lastrecord
->m_nextpkt
= m
;
1232 sb
->sb_lastrecord
= m
;
1233 sb
->sb_mbtail
= nlast
;
1235 SBLASTMBUFCHK(sb
, __func__
);
1236 SBLASTRECORDCHK(sb
, "sbappendadddr 2");
1238 postevent(0, sb
, EV_RWBYTES
);
1243 * Returns: 0 Error: No space/out of mbufs/etc.
1246 * Imputed: (*error_out) errno for error
1248 * sflt_data_in:??? [whatever a filter author chooses]
1251 sbappendaddr(struct sockbuf
*sb
, struct sockaddr
*asa
, struct mbuf
*m0
,
1252 struct mbuf
*control
, int *error_out
)
1255 boolean_t sb_unix
= (sb
->sb_flags
& SB_UNIX
);
1256 struct mbuf
*mbuf_chain
= NULL
;
1262 if (m0
&& (m0
->m_flags
& M_PKTHDR
) == 0) {
1263 panic("sbappendaddrorfree");
1266 if (sb
->sb_flags
& SB_DROP
) {
1270 if (control
!= NULL
&& !sb_unix
) {
1273 if (error_out
!= NULL
) {
1274 *error_out
= EINVAL
;
1279 /* Call socket data in filters */
1280 if (sb
->sb_flags
& SB_RECV
&& !(m0
&& m0
->m_flags
& M_SKIPCFIL
)) {
1282 error
= sflt_data_in(sb
->sb_so
, asa
, &m0
, &control
, 0);
1283 SBLASTRECORDCHK(sb
, __func__
);
1287 error
= cfil_sock_data_in(sb
->sb_so
, asa
, m0
, control
,
1290 #endif /* CONTENT_FILTER */
1293 if (error
!= EJUSTRETURN
) {
1297 if (control
!= NULL
&& !sb_unix
) {
1307 m0
->m_flags
&= ~M_SKIPCFIL
;
1310 mbuf_chain
= sbconcat_mbufs(sb
, asa
, m0
, control
);
1311 SBLASTRECORDCHK(sb
, "sbappendadddr 1");
1312 result
= sbappendchain(sb
, mbuf_chain
, 0);
1317 if (control
!= NULL
&& !sb_unix
) {
1321 *error_out
= ENOBUFS
;
1329 is_cmsg_valid(struct mbuf
*control
, struct cmsghdr
*cmsg
)
1335 if (cmsg
->cmsg_len
< sizeof(struct cmsghdr
)) {
1339 if ((uint8_t *)control
->m_data
>= (uint8_t *)cmsg
+ cmsg
->cmsg_len
) {
1343 if ((uint8_t *)control
->m_data
+ control
->m_len
<
1344 (uint8_t *)cmsg
+ cmsg
->cmsg_len
) {
1352 sbappendcontrol_internal(struct sockbuf
*sb
, struct mbuf
*m0
,
1353 struct mbuf
*control
)
1355 struct mbuf
*m
, *mlast
, *n
;
1359 panic("sbappendcontrol");
1362 for (m
= control
;; m
= m
->m_next
) {
1364 if (m
->m_next
== 0) {
1368 n
= m
; /* save pointer to last control buffer */
1369 for (m
= m0
; m
; m
= m
->m_next
) {
1372 if (space
> sbspace(sb
) && !(sb
->sb_flags
& SB_UNIX
)) {
1375 n
->m_next
= m0
; /* concatenate data to control */
1376 SBLASTRECORDCHK(sb
, "sbappendcontrol 1");
1378 for (m
= control
; m
->m_next
!= NULL
; m
= m
->m_next
) {
1384 if (sb
->sb_lastrecord
!= NULL
) {
1385 sb
->sb_lastrecord
->m_nextpkt
= control
;
1387 sb
->sb_mb
= control
;
1389 sb
->sb_lastrecord
= control
;
1390 sb
->sb_mbtail
= mlast
;
1392 SBLASTMBUFCHK(sb
, __func__
);
1393 SBLASTRECORDCHK(sb
, "sbappendcontrol 2");
1395 postevent(0, sb
, EV_RWBYTES
);
1400 sbappendcontrol(struct sockbuf
*sb
, struct mbuf
*m0
, struct mbuf
*control
,
1404 boolean_t sb_unix
= (sb
->sb_flags
& SB_UNIX
);
1410 if (sb
->sb_flags
& SB_DROP
) {
1414 if (control
!= NULL
&& !sb_unix
) {
1417 if (error_out
!= NULL
) {
1418 *error_out
= EINVAL
;
1423 if (sb
->sb_flags
& SB_RECV
&& !(m0
&& m0
->m_flags
& M_SKIPCFIL
)) {
1426 error
= sflt_data_in(sb
->sb_so
, NULL
, &m0
, &control
, 0);
1427 SBLASTRECORDCHK(sb
, __func__
);
1431 error
= cfil_sock_data_in(sb
->sb_so
, NULL
, m0
, control
,
1434 #endif /* CONTENT_FILTER */
1437 if (error
!= EJUSTRETURN
) {
1441 if (control
!= NULL
&& !sb_unix
) {
1451 m0
->m_flags
&= ~M_SKIPCFIL
;
1454 result
= sbappendcontrol_internal(sb
, m0
, control
);
1459 if (control
!= NULL
&& !sb_unix
) {
1463 *error_out
= ENOBUFS
;
1471 * Append a contiguous TCP data blob with TCP sequence number as control data
1472 * as a new msg to the receive socket buffer.
1475 sbappendmsgstream_rcv(struct sockbuf
*sb
, struct mbuf
*m
, uint32_t seqnum
,
1478 struct mbuf
*m_eor
= NULL
;
1479 u_int32_t data_len
= 0;
1481 struct socket
*so
= sb
->sb_so
;
1487 VERIFY((m
->m_flags
& M_PKTHDR
) && m_pktlen(m
) > 0);
1488 VERIFY(so
->so_msg_state
!= NULL
);
1489 VERIFY(sb
->sb_flags
& SB_RECV
);
1491 /* Keep the TCP sequence number in the mbuf pkthdr */
1492 m
->m_pkthdr
.msg_seq
= seqnum
;
1494 /* find last mbuf and set M_EOR */
1495 for (m_eor
= m
;; m_eor
= m_eor
->m_next
) {
1497 * If the msg is unordered, we need to account for
1498 * these bytes in receive socket buffer size. Otherwise,
1499 * the receive window advertised will shrink because
1500 * of the additional unordered bytes added to the
1504 m_eor
->m_flags
|= M_UNORDERED_DATA
;
1505 data_len
+= m_eor
->m_len
;
1506 so
->so_msg_state
->msg_uno_bytes
+= m_eor
->m_len
;
1508 m_eor
->m_flags
&= ~M_UNORDERED_DATA
;
1510 if (m_eor
->m_next
== NULL
) {
1515 /* set EOR flag at end of byte blob */
1516 m_eor
->m_flags
|= M_EOR
;
1518 /* expand the receive socket buffer to allow unordered data */
1519 if (unordered
&& !sbreserve(sb
, sb
->sb_hiwat
+ data_len
)) {
1521 * Could not allocate memory for unordered data, it
1522 * means this packet will have to be delivered in order
1524 printf("%s: could not reserve space for unordered data\n",
1528 if (!unordered
&& (sb
->sb_mbtail
!= NULL
) &&
1529 !(sb
->sb_mbtail
->m_flags
& M_UNORDERED_DATA
)) {
1530 sb
->sb_mbtail
->m_flags
&= ~M_EOR
;
1531 sbcompress(sb
, m
, sb
->sb_mbtail
);
1534 ret
= sbappendrecord(sb
, m
);
1536 VERIFY(sb
->sb_mbtail
->m_flags
& M_EOR
);
1541 * TCP streams have message based out of order delivery support, or have
1542 * Multipath TCP support, or are regular TCP sockets
1545 sbappendstream_rcvdemux(struct socket
*so
, struct mbuf
*m
, uint32_t seqnum
,
1552 !((so
->so_flags
& SOF_MP_SUBFLOW
) &&
1553 (m
->m_flags
& M_PKTHDR
) &&
1554 (m
->m_pkthdr
.pkt_flags
& PKTF_MPTCP_DFIN
))) {
1559 if (so
->so_flags
& SOF_ENABLE_MSGS
) {
1560 ret
= sbappendmsgstream_rcv(&so
->so_rcv
, m
, seqnum
, unordered
);
1563 else if (so
->so_flags
& SOF_MP_SUBFLOW
) {
1564 ret
= sbappendmptcpstream_rcv(&so
->so_rcv
, m
);
1568 ret
= sbappendstream(&so
->so_rcv
, m
);
1575 sbappendmptcpstream_rcv(struct sockbuf
*sb
, struct mbuf
*m
)
1577 struct socket
*so
= sb
->sb_so
;
1579 VERIFY(m
== NULL
|| (m
->m_flags
& M_PKTHDR
));
1580 /* SB_NOCOMPRESS must be set prevent loss of M_PKTHDR data */
1581 VERIFY((sb
->sb_flags
& (SB_RECV
| SB_NOCOMPRESS
)) ==
1582 (SB_RECV
| SB_NOCOMPRESS
));
1584 if (m
== NULL
|| m_pktlen(m
) == 0 || (sb
->sb_flags
& SB_DROP
) ||
1585 (so
->so_state
& SS_CANTRCVMORE
)) {
1586 if (m
&& (m
->m_flags
& M_PKTHDR
) &&
1588 (m
->m_pkthdr
.pkt_flags
& PKTF_MPTCP_DFIN
)) {
1589 mptcp_input(tptomptp(sototcpcb(so
))->mpt_mpte
, m
);
1591 } else if (m
!= NULL
) {
1596 /* the socket is not closed, so SOF_MP_SUBFLOW must be set */
1597 VERIFY(so
->so_flags
& SOF_MP_SUBFLOW
);
1599 if (m
->m_nextpkt
!= NULL
|| (sb
->sb_mb
!= sb
->sb_lastrecord
)) {
1600 panic("%s: nexpkt %p || mb %p != lastrecord %p\n", __func__
,
1601 m
->m_nextpkt
, sb
->sb_mb
, sb
->sb_lastrecord
);
1605 SBLASTMBUFCHK(sb
, __func__
);
1607 /* No filter support (SB_RECV) on mptcp subflow sockets */
1609 sbcompress(sb
, m
, sb
->sb_mbtail
);
1610 sb
->sb_lastrecord
= sb
->sb_mb
;
1611 SBLASTRECORDCHK(sb
, __func__
);
1617 * Append message to send socket buffer based on priority.
1620 sbappendmsg_snd(struct sockbuf
*sb
, struct mbuf
*m
)
1622 struct socket
*so
= sb
->sb_so
;
1623 struct msg_priq
*priq
;
1626 VERIFY(so
->so_msg_state
!= NULL
);
1628 if (m
->m_nextpkt
!= NULL
|| (sb
->sb_mb
!= sb
->sb_lastrecord
)) {
1629 panic("sbappendstream: nexpkt %p || mb %p != lastrecord %p\n",
1630 m
->m_nextpkt
, sb
->sb_mb
, sb
->sb_lastrecord
);
1633 SBLASTMBUFCHK(sb
, __func__
);
1635 if (m
== NULL
|| (sb
->sb_flags
& SB_DROP
) || so
->so_msg_state
== NULL
) {
1642 priq
= &so
->so_msg_state
->msg_priq
[m
->m_pkthdr
.msg_pri
];
1644 /* note if we need to propogate M_EOR to the last mbuf */
1645 if (m
->m_flags
& M_EOR
) {
1648 /* Reset M_EOR from the first mbuf */
1649 m
->m_flags
&= ~(M_EOR
);
1652 if (priq
->msgq_head
== NULL
) {
1653 VERIFY(priq
->msgq_tail
== NULL
&& priq
->msgq_lastmsg
== NULL
);
1654 priq
->msgq_head
= priq
->msgq_lastmsg
= m
;
1656 VERIFY(priq
->msgq_tail
->m_next
== NULL
);
1658 /* Check if the last message has M_EOR flag set */
1659 if (priq
->msgq_tail
->m_flags
& M_EOR
) {
1660 /* Insert as a new message */
1661 priq
->msgq_lastmsg
->m_nextpkt
= m
;
1663 /* move the lastmsg pointer */
1664 priq
->msgq_lastmsg
= m
;
1666 /* Append to the existing message */
1667 priq
->msgq_tail
->m_next
= m
;
1671 /* Update accounting and the queue tail pointer */
1673 while (m
->m_next
!= NULL
) {
1675 priq
->msgq_bytes
+= m
->m_len
;
1679 priq
->msgq_bytes
+= m
->m_len
;
1682 m
->m_flags
|= M_EOR
;
1685 * Since the user space can not write a new msg
1686 * without completing the previous one, we can
1687 * reset this flag to start sending again.
1689 priq
->msgq_flags
&= ~(MSGQ_MSG_NOTDONE
);
1692 priq
->msgq_tail
= m
;
1694 SBLASTRECORDCHK(sb
, "sbappendstream 2");
1695 postevent(0, sb
, EV_RWBYTES
);
1700 * Pull data from priority queues to the serial snd queue
1701 * right before sending.
1704 sbpull_unordered_data(struct socket
*so
, int32_t off
, int32_t len
)
1707 struct msg_priq
*priq
= NULL
;
1709 VERIFY(so
->so_msg_state
!= NULL
);
1711 topull
= (off
+ len
) - so
->so_msg_state
->msg_serial_bytes
;
1714 while (i
>= MSG_PRI_MIN
&& topull
> 0) {
1715 struct mbuf
*m
= NULL
, *mqhead
= NULL
, *mend
= NULL
;
1716 priq
= &so
->so_msg_state
->msg_priq
[i
];
1717 if ((priq
->msgq_flags
& MSGQ_MSG_NOTDONE
) &&
1718 priq
->msgq_head
== NULL
) {
1720 * We were in the middle of sending
1721 * a message and we have not seen the
1724 VERIFY(priq
->msgq_lastmsg
== NULL
&&
1725 priq
->msgq_tail
== NULL
);
1728 if (priq
->msgq_head
!= NULL
) {
1729 int32_t bytes
= 0, topull_tmp
= topull
;
1731 * We found a msg while scanning the priority
1732 * queue from high to low priority.
1734 m
= priq
->msgq_head
;
1739 * Move bytes from the priority queue to the
1740 * serial queue. Compute the number of bytes
1743 while (mqhead
->m_next
!= NULL
&& topull_tmp
> 0) {
1744 bytes
+= mqhead
->m_len
;
1745 topull_tmp
-= mqhead
->m_len
;
1747 mqhead
= mqhead
->m_next
;
1750 if (mqhead
->m_next
== NULL
) {
1752 * If we have only one more mbuf left,
1753 * move the last mbuf of this message to
1754 * serial queue and set the head of the
1755 * queue to be the next message.
1757 bytes
+= mqhead
->m_len
;
1759 mqhead
= m
->m_nextpkt
;
1760 if (!(mend
->m_flags
& M_EOR
)) {
1762 * We have not seen the end of
1763 * this message, so we can not
1766 priq
->msgq_flags
|= MSGQ_MSG_NOTDONE
;
1769 mend
->m_flags
&= ~(M_EOR
);
1772 /* propogate the next msg pointer */
1773 mqhead
->m_nextpkt
= m
->m_nextpkt
;
1775 priq
->msgq_head
= mqhead
;
1778 * if the lastmsg pointer points to
1779 * the mbuf that is being dequeued, update
1780 * it to point to the new head.
1782 if (priq
->msgq_lastmsg
== m
) {
1783 priq
->msgq_lastmsg
= priq
->msgq_head
;
1786 m
->m_nextpkt
= NULL
;
1787 mend
->m_next
= NULL
;
1789 if (priq
->msgq_head
== NULL
) {
1790 /* Moved all messages, update tail */
1791 priq
->msgq_tail
= NULL
;
1792 VERIFY(priq
->msgq_lastmsg
== NULL
);
1795 /* Move it to serial sb_mb queue */
1796 if (so
->so_snd
.sb_mb
== NULL
) {
1797 so
->so_snd
.sb_mb
= m
;
1799 so
->so_snd
.sb_mbtail
->m_next
= m
;
1802 priq
->msgq_bytes
-= bytes
;
1803 VERIFY(priq
->msgq_bytes
>= 0);
1804 sbwakeup(&so
->so_snd
);
1806 so
->so_msg_state
->msg_serial_bytes
+= bytes
;
1807 so
->so_snd
.sb_mbtail
= mend
;
1808 so
->so_snd
.sb_lastrecord
= so
->so_snd
.sb_mb
;
1811 (off
+ len
) - so
->so_msg_state
->msg_serial_bytes
;
1813 if (priq
->msgq_flags
& MSGQ_MSG_NOTDONE
) {
1820 sblastrecordchk(&so
->so_snd
, "sbpull_unordered_data");
1821 sblastmbufchk(&so
->so_snd
, "sbpull_unordered_data");
1825 * Compress mbuf chain m into the socket
1826 * buffer sb following mbuf n. If n
1827 * is null, the buffer is presumed empty.
1830 sbcompress(struct sockbuf
*sb
, struct mbuf
*m
, struct mbuf
*n
)
1832 int eor
= 0, compress
= (!(sb
->sb_flags
& SB_NOCOMPRESS
));
1836 /* There is nothing to compress; just update the tail */
1837 for (; n
->m_next
!= NULL
; n
= n
->m_next
) {
1845 eor
|= m
->m_flags
& M_EOR
;
1846 if (compress
&& m
->m_len
== 0 && (eor
== 0 ||
1847 (((o
= m
->m_next
) || (o
= n
)) && o
->m_type
== m
->m_type
))) {
1848 if (sb
->sb_lastrecord
== m
) {
1849 sb
->sb_lastrecord
= m
->m_next
;
1854 if (compress
&& n
!= NULL
&& (n
->m_flags
& M_EOR
) == 0 &&
1858 m
->m_len
<= MCLBYTES
/ 4 && /* XXX: Don't copy too much */
1859 m
->m_len
<= M_TRAILINGSPACE(n
) &&
1860 n
->m_type
== m
->m_type
) {
1861 bcopy(mtod(m
, caddr_t
), mtod(n
, caddr_t
) + n
->m_len
,
1862 (unsigned)m
->m_len
);
1863 n
->m_len
+= m
->m_len
;
1864 sb
->sb_cc
+= m
->m_len
;
1865 if (m
->m_type
!= MT_DATA
&& m
->m_type
!= MT_HEADER
&&
1866 m
->m_type
!= MT_OOBDATA
) {
1867 /* XXX: Probably don't need */
1868 sb
->sb_ctl
+= m
->m_len
;
1871 /* update send byte count */
1872 if (sb
->sb_flags
& SB_SNDBYTE_CNT
) {
1873 inp_incr_sndbytes_total(sb
->sb_so
,
1875 inp_incr_sndbytes_unsent(sb
->sb_so
,
1889 m
->m_flags
&= ~M_EOR
;
1897 printf("semi-panic: sbcompress\n");
1901 SBLASTMBUFCHK(sb
, __func__
);
1902 postevent(0, sb
, EV_RWBYTES
);
1906 sb_empty_assert(struct sockbuf
*sb
, const char *where
)
1908 if (!(sb
->sb_cc
== 0 && sb
->sb_mb
== NULL
&& sb
->sb_mbcnt
== 0 &&
1909 sb
->sb_mbtail
== NULL
&& sb
->sb_lastrecord
== NULL
)) {
1910 panic("%s: sb %p so %p cc %d mbcnt %d mb %p mbtail %p "
1911 "lastrecord %p\n", where
, sb
, sb
->sb_so
, sb
->sb_cc
,
1912 sb
->sb_mbcnt
, sb
->sb_mb
, sb
->sb_mbtail
,
1919 sbflush_priq(struct msg_priq
*priq
)
1922 m
= priq
->msgq_head
;
1926 priq
->msgq_head
= priq
->msgq_tail
= priq
->msgq_lastmsg
= NULL
;
1927 priq
->msgq_bytes
= priq
->msgq_flags
= 0;
1931 * Free all mbufs in a sockbuf.
1932 * Check that all resources are reclaimed.
1935 sbflush(struct sockbuf
*sb
)
1937 void *lr_saved
= __builtin_return_address(0);
1938 struct socket
*so
= sb
->sb_so
;
1941 /* so_usecount may be 0 if we get here from sofreelastref() */
1943 panic("%s: null so, sb=%p sb_flags=0x%x lr=%p\n",
1944 __func__
, sb
, sb
->sb_flags
, lr_saved
);
1946 } else if (so
->so_usecount
< 0) {
1947 panic("%s: sb=%p sb_flags=0x%x sb_so=%p usecount=%d lr=%p "
1948 "lrh= %s\n", __func__
, sb
, sb
->sb_flags
, so
,
1949 so
->so_usecount
, lr_saved
, solockhistory_nr(so
));
1954 * Obtain lock on the socket buffer (SB_LOCK). This is required
1955 * to prevent the socket buffer from being unexpectedly altered
1956 * while it is used by another thread in socket send/receive.
1958 * sblock() must not fail here, hence the assertion.
1960 (void) sblock(sb
, SBL_WAIT
| SBL_NOINTR
| SBL_IGNDEFUNCT
);
1961 VERIFY(sb
->sb_flags
& SB_LOCK
);
1963 while (sb
->sb_mbcnt
> 0) {
1965 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
1966 * we would loop forever. Panic instead.
1968 if (!sb
->sb_cc
&& (sb
->sb_mb
== NULL
|| sb
->sb_mb
->m_len
)) {
1971 sbdrop(sb
, (int)sb
->sb_cc
);
1974 if (!(sb
->sb_flags
& SB_RECV
) && (so
->so_flags
& SOF_ENABLE_MSGS
)) {
1975 VERIFY(so
->so_msg_state
!= NULL
);
1976 for (i
= MSG_PRI_MIN
; i
<= MSG_PRI_MAX
; ++i
) {
1977 sbflush_priq(&so
->so_msg_state
->msg_priq
[i
]);
1979 so
->so_msg_state
->msg_serial_bytes
= 0;
1980 so
->so_msg_state
->msg_uno_bytes
= 0;
1983 sb_empty_assert(sb
, __func__
);
1984 postevent(0, sb
, EV_RWBYTES
);
1986 sbunlock(sb
, TRUE
); /* keep socket locked */
1990 * Drop data from (the front of) a sockbuf.
1991 * use m_freem_list to free the mbuf structures
1992 * under a single lock... this is done by pruning
1993 * the top of the tree from the body by keeping track
1994 * of where we get to in the tree and then zeroing the
1995 * two pertinent pointers m_nextpkt and m_next
1996 * the socket buffer is then updated to point at the new
1997 * top of the tree and the pruned area is released via
2001 sbdrop(struct sockbuf
*sb
, int len
)
2003 struct mbuf
*m
, *free_list
, *ml
;
2004 struct mbuf
*next
, *last
;
2006 next
= (m
= sb
->sb_mb
) ? m
->m_nextpkt
: 0;
2008 if (m
!= NULL
&& len
> 0 && !(sb
->sb_flags
& SB_RECV
) &&
2009 ((sb
->sb_so
->so_flags
& SOF_MP_SUBFLOW
) ||
2010 (SOCK_CHECK_DOM(sb
->sb_so
, PF_MULTIPATH
) &&
2011 SOCK_CHECK_PROTO(sb
->sb_so
, IPPROTO_TCP
))) &&
2012 !(sb
->sb_so
->so_flags1
& SOF1_POST_FALLBACK_SYNC
)) {
2013 mptcp_preproc_sbdrop(sb
->sb_so
, m
, (unsigned int)len
);
2015 if (m
!= NULL
&& len
> 0 && !(sb
->sb_flags
& SB_RECV
) &&
2016 (sb
->sb_so
->so_flags
& SOF_MP_SUBFLOW
) &&
2017 (sb
->sb_so
->so_flags1
& SOF1_POST_FALLBACK_SYNC
)) {
2018 mptcp_fallback_sbdrop(sb
->sb_so
, m
, len
);
2021 KERNEL_DEBUG((DBG_FNC_SBDROP
| DBG_FUNC_START
), sb
, len
, 0, 0, 0);
2023 free_list
= last
= m
;
2024 ml
= (struct mbuf
*)0;
2030 * temporarily replacing this panic with printf
2031 * because it occurs occasionally when closing
2032 * a socket when there is no harm in ignoring
2033 * it. This problem will be investigated
2036 /* panic("sbdrop"); */
2037 printf("sbdrop - count not zero\n");
2040 * zero the counts. if we have no mbufs,
2041 * we have no data (PR-2986815)
2045 if (!(sb
->sb_flags
& SB_RECV
) &&
2046 (sb
->sb_so
->so_flags
& SOF_ENABLE_MSGS
)) {
2047 sb
->sb_so
->so_msg_state
->
2048 msg_serial_bytes
= 0;
2053 next
= m
->m_nextpkt
;
2056 if (m
->m_len
> len
) {
2060 /* update the send byte count */
2061 if (sb
->sb_flags
& SB_SNDBYTE_CNT
) {
2062 inp_decr_sndbytes_total(sb
->sb_so
, len
);
2064 if (m
->m_type
!= MT_DATA
&& m
->m_type
!= MT_HEADER
&&
2065 m
->m_type
!= MT_OOBDATA
) {
2076 while (m
&& m
->m_len
== 0) {
2083 ml
->m_next
= (struct mbuf
*)0;
2084 last
->m_nextpkt
= (struct mbuf
*)0;
2085 m_freem_list(free_list
);
2089 m
->m_nextpkt
= next
;
2095 * First part is an inline SB_EMPTY_FIXUP(). Second part
2096 * makes sure sb_lastrecord is up-to-date if we dropped
2097 * part of the last record.
2101 sb
->sb_mbtail
= NULL
;
2102 sb
->sb_lastrecord
= NULL
;
2103 } else if (m
->m_nextpkt
== NULL
) {
2104 sb
->sb_lastrecord
= m
;
2108 cfil_sock_buf_update(sb
);
2109 #endif /* CONTENT_FILTER */
2111 postevent(0, sb
, EV_RWBYTES
);
2113 KERNEL_DEBUG((DBG_FNC_SBDROP
| DBG_FUNC_END
), sb
, 0, 0, 0, 0);
2117 * Drop a record off the front of a sockbuf
2118 * and move the next record to the front.
2121 sbdroprecord(struct sockbuf
*sb
)
2123 struct mbuf
*m
, *mn
;
2127 sb
->sb_mb
= m
->m_nextpkt
;
2135 postevent(0, sb
, EV_RWBYTES
);
2139 * Create a "control" mbuf containing the specified data
2140 * with the specified type for presentation on a socket buffer.
2143 sbcreatecontrol(caddr_t p
, int size
, int type
, int level
)
2148 if (CMSG_SPACE((u_int
)size
) > MLEN
) {
2149 return (struct mbuf
*)NULL
;
2151 if ((m
= m_get(M_DONTWAIT
, MT_CONTROL
)) == NULL
) {
2152 return (struct mbuf
*)NULL
;
2154 cp
= mtod(m
, struct cmsghdr
*);
2155 VERIFY(IS_P2ALIGNED(cp
, sizeof(u_int32_t
)));
2156 /* XXX check size? */
2157 (void) memcpy(CMSG_DATA(cp
), p
, size
);
2158 m
->m_len
= CMSG_SPACE(size
);
2159 cp
->cmsg_len
= CMSG_LEN(size
);
2160 cp
->cmsg_level
= level
;
2161 cp
->cmsg_type
= type
;
2166 sbcreatecontrol_mbuf(caddr_t p
, int size
, int type
, int level
, struct mbuf
**mp
)
2172 *mp
= sbcreatecontrol(p
, size
, type
, level
);
2176 if (CMSG_SPACE((u_int
)size
) + (*mp
)->m_len
> MLEN
) {
2177 mp
= &(*mp
)->m_next
;
2178 *mp
= sbcreatecontrol(p
, size
, type
, level
);
2184 cp
= (struct cmsghdr
*)(void *)(mtod(m
, char *) + m
->m_len
);
2185 /* CMSG_SPACE ensures 32-bit alignment */
2186 VERIFY(IS_P2ALIGNED(cp
, sizeof(u_int32_t
)));
2187 m
->m_len
+= CMSG_SPACE(size
);
2189 /* XXX check size? */
2190 (void) memcpy(CMSG_DATA(cp
), p
, size
);
2191 cp
->cmsg_len
= CMSG_LEN(size
);
2192 cp
->cmsg_level
= level
;
2193 cp
->cmsg_type
= type
;
2200 * Some routines that return EOPNOTSUPP for entry points that are not
2201 * supported by a protocol. Fill in as needed.
2204 pru_abort_notsupp(struct socket
*so
)
2211 pru_accept_notsupp(struct socket
*so
, struct sockaddr
**nam
)
2213 #pragma unused(so, nam)
2218 pru_attach_notsupp(struct socket
*so
, int proto
, struct proc
*p
)
2220 #pragma unused(so, proto, p)
2225 pru_bind_notsupp(struct socket
*so
, struct sockaddr
*nam
, struct proc
*p
)
2227 #pragma unused(so, nam, p)
2232 pru_connect_notsupp(struct socket
*so
, struct sockaddr
*nam
, struct proc
*p
)
2234 #pragma unused(so, nam, p)
2239 pru_connect2_notsupp(struct socket
*so1
, struct socket
*so2
)
2241 #pragma unused(so1, so2)
2246 pru_connectx_notsupp(struct socket
*so
, struct sockaddr
*src
,
2247 struct sockaddr
*dst
, struct proc
*p
, uint32_t ifscope
,
2248 sae_associd_t aid
, sae_connid_t
*pcid
, uint32_t flags
, void *arg
,
2249 uint32_t arglen
, struct uio
*uio
, user_ssize_t
*bytes_written
)
2251 #pragma unused(so, src, dst, p, ifscope, aid, pcid, flags, arg, arglen, uio, bytes_written)
2256 pru_control_notsupp(struct socket
*so
, u_long cmd
, caddr_t data
,
2257 struct ifnet
*ifp
, struct proc
*p
)
2259 #pragma unused(so, cmd, data, ifp, p)
2264 pru_detach_notsupp(struct socket
*so
)
2271 pru_disconnect_notsupp(struct socket
*so
)
2278 pru_disconnectx_notsupp(struct socket
*so
, sae_associd_t aid
, sae_connid_t cid
)
2280 #pragma unused(so, aid, cid)
2285 pru_listen_notsupp(struct socket
*so
, struct proc
*p
)
2287 #pragma unused(so, p)
2292 pru_peeraddr_notsupp(struct socket
*so
, struct sockaddr
**nam
)
2294 #pragma unused(so, nam)
2299 pru_rcvd_notsupp(struct socket
*so
, int flags
)
2301 #pragma unused(so, flags)
2306 pru_rcvoob_notsupp(struct socket
*so
, struct mbuf
*m
, int flags
)
2308 #pragma unused(so, m, flags)
2313 pru_send_notsupp(struct socket
*so
, int flags
, struct mbuf
*m
,
2314 struct sockaddr
*addr
, struct mbuf
*control
, struct proc
*p
)
2316 #pragma unused(so, flags, m, addr, control, p)
2321 pru_send_list_notsupp(struct socket
*so
, int flags
, struct mbuf
*m
,
2322 struct sockaddr
*addr
, struct mbuf
*control
, struct proc
*p
)
2324 #pragma unused(so, flags, m, addr, control, p)
2329 * This isn't really a ``null'' operation, but it's the default one
2330 * and doesn't do anything destructive.
2333 pru_sense_null(struct socket
*so
, void *ub
, int isstat64
)
2335 if (isstat64
!= 0) {
2336 struct stat64
*sb64
;
2338 sb64
= (struct stat64
*)ub
;
2339 sb64
->st_blksize
= so
->so_snd
.sb_hiwat
;
2343 sb
= (struct stat
*)ub
;
2344 sb
->st_blksize
= so
->so_snd
.sb_hiwat
;
2352 pru_sosend_notsupp(struct socket
*so
, struct sockaddr
*addr
, struct uio
*uio
,
2353 struct mbuf
*top
, struct mbuf
*control
, int flags
)
2355 #pragma unused(so, addr, uio, top, control, flags)
2360 pru_sosend_list_notsupp(struct socket
*so
, struct uio
**uio
,
2361 u_int uiocnt
, int flags
)
2363 #pragma unused(so, uio, uiocnt, flags)
2368 pru_soreceive_notsupp(struct socket
*so
, struct sockaddr
**paddr
,
2369 struct uio
*uio
, struct mbuf
**mp0
, struct mbuf
**controlp
, int *flagsp
)
2371 #pragma unused(so, paddr, uio, mp0, controlp, flagsp)
2376 pru_soreceive_list_notsupp(struct socket
*so
,
2377 struct recv_msg_elem
*recv_msg_array
, u_int uiocnt
, int *flagsp
)
2379 #pragma unused(so, recv_msg_array, uiocnt, flagsp)
2384 pru_shutdown_notsupp(struct socket
*so
)
2391 pru_sockaddr_notsupp(struct socket
*so
, struct sockaddr
**nam
)
2393 #pragma unused(so, nam)
2398 pru_sopoll_notsupp(struct socket
*so
, int events
, kauth_cred_t cred
, void *wql
)
2400 #pragma unused(so, events, cred, wql)
2405 pru_socheckopt_null(struct socket
*so
, struct sockopt
*sopt
)
2407 #pragma unused(so, sopt)
2409 * Allow all options for set/get by default.
2415 pru_preconnect_null(struct socket
*so
)
void
pru_sanitize(struct pr_usrreqs *pru)
{
#define DEFAULT(foo, bar) if ((foo) == NULL) (foo) = (bar)
	DEFAULT(pru->pru_abort, pru_abort_notsupp);
	DEFAULT(pru->pru_accept, pru_accept_notsupp);
	DEFAULT(pru->pru_attach, pru_attach_notsupp);
	DEFAULT(pru->pru_bind, pru_bind_notsupp);
	DEFAULT(pru->pru_connect, pru_connect_notsupp);
	DEFAULT(pru->pru_connect2, pru_connect2_notsupp);
	DEFAULT(pru->pru_connectx, pru_connectx_notsupp);
	DEFAULT(pru->pru_control, pru_control_notsupp);
	DEFAULT(pru->pru_detach, pru_detach_notsupp);
	DEFAULT(pru->pru_disconnect, pru_disconnect_notsupp);
	DEFAULT(pru->pru_disconnectx, pru_disconnectx_notsupp);
	DEFAULT(pru->pru_listen, pru_listen_notsupp);
	DEFAULT(pru->pru_peeraddr, pru_peeraddr_notsupp);
	DEFAULT(pru->pru_rcvd, pru_rcvd_notsupp);
	DEFAULT(pru->pru_rcvoob, pru_rcvoob_notsupp);
	DEFAULT(pru->pru_send, pru_send_notsupp);
	DEFAULT(pru->pru_send_list, pru_send_list_notsupp);
	DEFAULT(pru->pru_sense, pru_sense_null);
	DEFAULT(pru->pru_shutdown, pru_shutdown_notsupp);
	DEFAULT(pru->pru_sockaddr, pru_sockaddr_notsupp);
	DEFAULT(pru->pru_sopoll, pru_sopoll_notsupp);
	DEFAULT(pru->pru_soreceive, pru_soreceive_notsupp);
	DEFAULT(pru->pru_soreceive_list, pru_soreceive_list_notsupp);
	DEFAULT(pru->pru_sosend, pru_sosend_notsupp);
	DEFAULT(pru->pru_sosend_list, pru_sosend_list_notsupp);
	DEFAULT(pru->pru_socheckopt, pru_socheckopt_null);
	DEFAULT(pru->pru_preconnect, pru_preconnect_null);
#undef DEFAULT
}
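
/*
 * Illustrative sketch (hypothetical protocol, not part of this file): a
 * protocol that only implements attach and detach can rely on
 * pru_sanitize() to fill every remaining entry point with the *_notsupp
 * or *_null defaults above, so callers never dereference a NULL handler:
 *
 *	static struct pr_usrreqs foo_usrreqs = {
 *		.pru_attach = foo_attach,
 *		.pru_detach = foo_detach,
 *	};
 *
 *	pru_sanitize(&foo_usrreqs);
 *	// foo_usrreqs.pru_connect is now pru_connect_notsupp, etc.
 */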
/*
 * The following are macros on BSD and functions on Darwin
 */

/* Do we need to notify the other side when I/O is possible? */
int
sb_notify(struct sockbuf *sb)
{
	return sb->sb_waiters > 0 ||
	       (sb->sb_flags & (SB_SEL | SB_ASYNC | SB_UPCALL | SB_KNOTE));
}
/*
 * How much space is there in a socket buffer (so->so_snd or so->so_rcv)?
 * This is problematical if the fields are unsigned, as the space might
 * still be negative (cc > hiwat or mbcnt > mbmax).  Should detect
 * overflow and return 0.
 */
int
sbspace(struct sockbuf *sb)
{
	int pending = 0;
	int space = imin((int)(sb->sb_hiwat - sb->sb_cc),
	    (int)(sb->sb_mbmax - sb->sb_mbcnt));

	if (sb->sb_preconn_hiwat != 0) {
		space = imin((int)(sb->sb_preconn_hiwat - sb->sb_cc), space);
	}

	if (space < 0) {
		space = 0;
	}

	/* Compensate for data being processed by content filters */
#if CONTENT_FILTER
	pending = cfil_sock_data_space(sb);
#endif /* CONTENT_FILTER */
	if (pending > space) {
		space = 0;
	} else {
		space -= pending;
	}

	return space;
}
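
/*
 * Worked example (illustrative numbers only): with sb_hiwat = 8192,
 * sb_cc = 6144, sb_mbmax = 16384 and sb_mbcnt = 4096, the byte headroom
 * is 8192 - 6144 = 2048 and the mbuf headroom is 16384 - 4096 = 12288,
 * so sbspace() reports imin(2048, 12288) = 2048; any bytes still queued
 * in a content filter are then subtracted from that result.
 */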
/*
 * If this socket has priority queues, check if there is enough
 * space in the priority queue for this msg.
 */
int
msgq_sbspace(struct socket *so, struct mbuf *control)
{
	int space = 0, error;
	u_int32_t msgpri = 0;

	VERIFY(so->so_type == SOCK_STREAM &&
	    SOCK_PROTO(so) == IPPROTO_TCP);
	if (control != NULL) {
		error = tcp_get_msg_priority(control, &msgpri);
		if (error) {
			return 0;
		}
	} else {
		msgpri = MSG_PRI_0;
	}
	space = (so->so_snd.sb_idealsize / MSG_PRI_COUNT) -
	    so->so_msg_state->msg_priq[msgpri].msgq_bytes;
	if (space < 0) {
		space = 0;
	}
	return space;
}
/* do we have to send all at once on a socket? */
int
sosendallatonce(struct socket *so)
{
	return so->so_proto->pr_flags & PR_ATOMIC;
}

/* can we read something from so? */
int
soreadable(struct socket *so)
{
	return so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
	       ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	       && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	       ) ||
	       so->so_comp.tqh_first || so->so_error;
}
/* can we write something to so? */
int
sowriteable(struct socket *so)
{
	if ((so->so_state & SS_CANTSENDMORE) ||
	    so->so_error > 0) {
		return 1;
	}
	if (so_wait_for_if_feedback(so) || !socanwrite(so)) {
		return 0;
	}
	if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
		return 1;
	}

	if (sbspace(&(so)->so_snd) >= (so)->so_snd.sb_lowat) {
		if (so->so_flags & SOF_NOTSENT_LOWAT) {
			if ((SOCK_DOM(so) == PF_INET6 ||
			    SOCK_DOM(so) == PF_INET) &&
			    so->so_type == SOCK_STREAM) {
				return tcp_notsent_lowat_check(so);
			}
#if MPTCP
			else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
			    (SOCK_PROTO(so) == IPPROTO_TCP)) {
				return mptcp_notsent_lowat_check(so);
			}
#endif
			else {
				return 1;
			}
		} else {
			return 1;
		}
	}

	return 0;
}
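
/*
 * Usage note (illustrative, from user space): when SO_NOTSENT_LOWAT is
 * set on a TCP socket, sowriteable() defers the writability decision to
 * tcp_notsent_lowat_check() above instead of the plain sbspace() test.
 * A hypothetical caller would arm it with something like:
 *
 *	int lowat = 16 * 1024;
 *	setsockopt(fd, SOL_SOCKET, SO_NOTSENT_LOWAT, &lowat, sizeof(lowat));
 */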
/* adjust counters in sb reflecting allocation of m */
void
sballoc(struct sockbuf *sb, struct mbuf *m)
{
	u_int32_t cnt = 1;

	sb->sb_cc += m->m_len;
	if (m->m_type != MT_DATA && m->m_type != MT_HEADER &&
	    m->m_type != MT_OOBDATA) {
		sb->sb_ctl += m->m_len;
	}

	sb->sb_mbcnt += MSIZE;

	if (m->m_flags & M_EXT) {
		sb->sb_mbcnt += m->m_ext.ext_size;
		cnt += (m->m_ext.ext_size >> MSIZESHIFT);
	}
	OSAddAtomic(cnt, &total_sbmb_cnt);
	VERIFY(total_sbmb_cnt > 0);
	if (total_sbmb_cnt > total_sbmb_cnt_peak) {
		total_sbmb_cnt_peak = total_sbmb_cnt;
	}

	/*
	 * If data is being added to the send socket buffer,
	 * update the send byte count
	 */
	if (sb->sb_flags & SB_SNDBYTE_CNT) {
		inp_incr_sndbytes_total(sb->sb_so, m->m_len);
		inp_incr_sndbytes_unsent(sb->sb_so, m->m_len);
	}
}
/* adjust counters in sb reflecting freeing of m */
void
sbfree(struct sockbuf *sb, struct mbuf *m)
{
	int cnt = -1;

	sb->sb_cc -= m->m_len;
	if (m->m_type != MT_DATA && m->m_type != MT_HEADER &&
	    m->m_type != MT_OOBDATA) {
		sb->sb_ctl -= m->m_len;
	}

	sb->sb_mbcnt -= MSIZE;
	if (m->m_flags & M_EXT) {
		sb->sb_mbcnt -= m->m_ext.ext_size;
		cnt -= (m->m_ext.ext_size >> MSIZESHIFT);
	}
	OSAddAtomic(cnt, &total_sbmb_cnt);
	VERIFY(total_sbmb_cnt >= 0);
	if (total_sbmb_cnt < total_sbmb_cnt_floor) {
		total_sbmb_cnt_floor = total_sbmb_cnt;
	}

	/*
	 * If data is being removed from the send socket buffer,
	 * update the send byte count
	 */
	if (sb->sb_flags & SB_SNDBYTE_CNT) {
		inp_decr_sndbytes_total(sb->sb_so, m->m_len);
	}
}
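
/*
 * Invariant sketch (summary of the two helpers above, not new behavior):
 * sballoc() and sbfree() must be called symmetrically for every mbuf
 * linked into or unlinked from a socket buffer, e.g.:
 *
 *	sballoc(sb, m);	// sb_cc/sb_mbcnt grow, total_sbmb_cnt grows
 *	...
 *	sbfree(sb, m);	// the same counters shrink back
 *
 * otherwise sb_cc, sb_mbcnt and the global total_sbmb_cnt drift and the
 * VERIFY() checks above will eventually fire.
 */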
/*
 * Set lock on sockbuf sb; sleep if lock is already held.
 * Unless SB_NOINTR is set on sockbuf, sleep is interruptible.
 * Returns error without lock if sleep is interrupted.
 */
int
sblock(struct sockbuf *sb, uint32_t flags)
{
	boolean_t nointr = ((sb->sb_flags & SB_NOINTR) || (flags & SBL_NOINTR));
	void *lr_saved = __builtin_return_address(0);
	struct socket *so = sb->sb_so;
	void *wchan;
	int error = 0;
	thread_t tp = current_thread();

	VERIFY((flags & SBL_VALID) == flags);

	/* so_usecount may be 0 if we get here from sofreelastref() */
	if (so == NULL) {
		panic("%s: null so, sb=%p sb_flags=0x%x lr=%p\n",
		    __func__, sb, sb->sb_flags, lr_saved);
		/* NOTREACHED */
	} else if (so->so_usecount < 0) {
		panic("%s: sb=%p sb_flags=0x%x sb_so=%p usecount=%d lr=%p "
		    "lrh= %s\n", __func__, sb, sb->sb_flags, so,
		    so->so_usecount, lr_saved, solockhistory_nr(so));
		/* NOTREACHED */
	}

	/*
	 * The content filter thread must hold the sockbuf lock
	 */
	if ((so->so_flags & SOF_CONTENT_FILTER) && sb->sb_cfil_thread == tp) {
		/*
		 * Don't panic if we are defunct because SB_LOCK has
		 * been cleared by sodefunct()
		 */
		if (!(so->so_flags & SOF_DEFUNCT) && !(sb->sb_flags & SB_LOCK)) {
			panic("%s: SB_LOCK not held for %p\n",
			    __func__, sb);
		}
		/* Keep the sockbuf locked */
		return 0;
	}

	if ((sb->sb_flags & SB_LOCK) && !(flags & SBL_WAIT)) {
		return EWOULDBLOCK;
	}
	/*
	 * We may get here from sorflush(), in which case "sb" may not
	 * point to the real socket buffer.  Use the actual socket buffer
	 * address from the socket instead.
	 */
	wchan = (sb->sb_flags & SB_RECV) ?
	    &so->so_rcv.sb_flags : &so->so_snd.sb_flags;

	/*
	 * A content filter thread has exclusive access to the sockbuf
	 * until it clears the sb_cfil_thread field.
	 */
	while ((sb->sb_flags & SB_LOCK) ||
	    ((so->so_flags & SOF_CONTENT_FILTER) &&
	    sb->sb_cfil_thread != NULL)) {
		lck_mtx_t *mutex_held;

		/*
		 * XXX: This code should be moved up above outside of this loop;
		 * however, we may get here as part of sofreelastref(), and
		 * at that time pr_getlock() may no longer be able to return
		 * us the lock.  This will be fixed in future.
		 */
		if (so->so_proto->pr_getlock != NULL) {
			mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
		} else {
			mutex_held = so->so_proto->pr_domain->dom_mtx;
		}

		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

		sb->sb_wantlock++;
		VERIFY(sb->sb_wantlock != 0);

		error = msleep(wchan, mutex_held,
		    nointr ? PSOCK : PSOCK | PCATCH,
		    nointr ? "sb_lock_nointr" : "sb_lock", NULL);

		VERIFY(sb->sb_wantlock != 0);
		sb->sb_wantlock--;

		if (error == 0 && (so->so_flags & SOF_DEFUNCT) &&
		    !(flags & SBL_IGNDEFUNCT)) {
			error = EBADF;
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()),
			    (uint64_t)VM_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}

		if (error != 0) {
			return error;
		}
	}
	sb->sb_flags |= SB_LOCK;
	return 0;
}
/*
 * Release lock on sockbuf sb
 */
void
sbunlock(struct sockbuf *sb, boolean_t keeplocked)
{
	void *lr_saved = __builtin_return_address(0);
	struct socket *so = sb->sb_so;
	thread_t tp = current_thread();

	/* so_usecount may be 0 if we get here from sofreelastref() */
	if (so == NULL) {
		panic("%s: null so, sb=%p sb_flags=0x%x lr=%p\n",
		    __func__, sb, sb->sb_flags, lr_saved);
		/* NOTREACHED */
	} else if (so->so_usecount < 0) {
		panic("%s: sb=%p sb_flags=0x%x sb_so=%p usecount=%d lr=%p "
		    "lrh= %s\n", __func__, sb, sb->sb_flags, so,
		    so->so_usecount, lr_saved, solockhistory_nr(so));
		/* NOTREACHED */
	}

	/*
	 * The content filter thread must hold the sockbuf lock
	 */
	if ((so->so_flags & SOF_CONTENT_FILTER) && sb->sb_cfil_thread == tp) {
		/*
		 * Don't panic if we are defunct because SB_LOCK has
		 * been cleared by sodefunct()
		 */
		if (!(so->so_flags & SOF_DEFUNCT) &&
		    !(sb->sb_flags & SB_LOCK) &&
		    !(so->so_state & SS_DEFUNCT) &&
		    !(so->so_flags1 & SOF1_DEFUNCTINPROG)) {
			panic("%s: SB_LOCK not held for %p\n",
			    __func__, sb);
		}
		/* Keep the sockbuf locked and proceed */
	} else {
		VERIFY((sb->sb_flags & SB_LOCK) ||
		    (so->so_state & SS_DEFUNCT) ||
		    (so->so_flags1 & SOF1_DEFUNCTINPROG));

		sb->sb_flags &= ~SB_LOCK;

		if (sb->sb_wantlock > 0) {
			/*
			 * We may get here from sorflush(), in which case "sb"
			 * may not point to the real socket buffer.  Use the
			 * actual socket buffer address from the socket instead.
			 */
			wakeup((sb->sb_flags & SB_RECV) ? &so->so_rcv.sb_flags :
			    &so->so_snd.sb_flags);
		}
	}

	if (!keeplocked) {      /* unlock on exit */
		if (so->so_flags & SOF_MP_SUBFLOW || SOCK_DOM(so) == PF_MULTIPATH) {
			(*so->so_proto->pr_unlock)(so, 1, lr_saved);
		} else {
			lck_mtx_t *mutex_held;

			if (so->so_proto->pr_getlock != NULL) {
				mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
			} else {
				mutex_held = so->so_proto->pr_domain->dom_mtx;
			}

			LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

			VERIFY(so->so_usecount > 0);
			so->so_usecount--;
			so->unlock_lr[so->next_unlock_lr] = lr_saved;
			so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
			lck_mtx_unlock(mutex_held);
		}
	}
}
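
/*
 * Typical locking pattern (sketch only, showing how callers usually pair
 * the two routines above):
 *
 *	error = sblock(&so->so_snd, SBL_WAIT);
 *	if (error == 0) {
 *		... manipulate the send buffer ...
 *		sbunlock(&so->so_snd, TRUE);	// keep the socket lock
 *	}
 *
 * Passing keeplocked == FALSE additionally drops the socket's mutex and
 * one use count, as shown at the end of sbunlock().
 */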
void
sorwakeup(struct socket *so)
{
	if (sb_notify(&so->so_rcv)) {
		sowakeup(so, &so->so_rcv, NULL);
	}
}

void
sowwakeup(struct socket *so)
{
	if (sb_notify(&so->so_snd)) {
		sowakeup(so, &so->so_snd, NULL);
	}
}
void
soevent(struct socket *so, long hint)
{
	if (so->so_flags & SOF_KNOTE) {
		KNOTE(&so->so_klist, hint);
	}

	soevupcall(so, hint);

	/*
	 * Don't post an event if this is a subflow socket or
	 * the app has opted out of using cellular interface
	 */
	if ((hint & SO_FILT_HINT_IFDENIED) &&
	    !(so->so_flags & SOF_MP_SUBFLOW) &&
	    !(so->so_restrictions & SO_RESTRICT_DENY_CELLULAR) &&
	    !(so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE) &&
	    !(so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED)) {
		soevent_ifdenied(so);
	}
}

void
soevupcall(struct socket *so, u_int32_t hint)
{
	if (so->so_event != NULL) {
		caddr_t so_eventarg = so->so_eventarg;

		hint &= so->so_eventmask;
		if (hint != 0) {
			so->so_event(so, so_eventarg, hint);
		}
	}
}
static void
soevent_ifdenied(struct socket *so)
{
	struct kev_netpolicy_ifdenied ev_ifdenied;

	bzero(&ev_ifdenied, sizeof(ev_ifdenied));
	/*
	 * The event consumer is interested in the effective {upid,pid,uuid}
	 * info, which can be different from those related to the process
	 * that recently performed a system call on the socket, i.e. when the
	 * socket is delegated.
	 */
	if (so->so_flags & SOF_DELEGATED) {
		ev_ifdenied.ev_data.eupid = so->e_upid;
		ev_ifdenied.ev_data.epid = so->e_pid;
		uuid_copy(ev_ifdenied.ev_data.euuid, so->e_uuid);
	} else {
		ev_ifdenied.ev_data.eupid = so->last_upid;
		ev_ifdenied.ev_data.epid = so->last_pid;
		uuid_copy(ev_ifdenied.ev_data.euuid, so->last_uuid);
	}

	if (++so->so_ifdenied_notifies > 1) {
		/*
		 * Allow for at most one kernel event to be generated per
		 * socket; so_ifdenied_notifies is reset upon changes in
		 * the UUID policy.  See comments in inp_update_policy.
		 */
		if (net_io_policy_log) {
			uuid_string_t buf;

			uuid_unparse(ev_ifdenied.ev_data.euuid, buf);
			log(LOG_DEBUG, "%s[%d]: so 0x%llx [%d,%d] epid %d "
			    "euuid %s%s has %d redundant events suppressed\n",
			    __func__, so->last_pid,
			    (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
			    SOCK_TYPE(so), ev_ifdenied.ev_data.epid, buf,
			    ((so->so_flags & SOF_DELEGATED) ?
			    " [delegated]" : ""), so->so_ifdenied_notifies);
		}
	} else {
		if (net_io_policy_log) {
			uuid_string_t buf;

			uuid_unparse(ev_ifdenied.ev_data.euuid, buf);
			log(LOG_DEBUG, "%s[%d]: so 0x%llx [%d,%d] epid %d "
			    "euuid %s%s event posted\n", __func__,
			    so->last_pid, (uint64_t)VM_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so),
			    ev_ifdenied.ev_data.epid, buf,
			    ((so->so_flags & SOF_DELEGATED) ?
			    " [delegated]" : ""));
		}
		netpolicy_post_msg(KEV_NETPOLICY_IFDENIED, &ev_ifdenied.ev_data,
		    sizeof(ev_ifdenied));
	}
}
/*
 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
 */
struct sockaddr *
dup_sockaddr(struct sockaddr *sa, int canwait)
{
	struct sockaddr *sa2;

	MALLOC(sa2, struct sockaddr *, sa->sa_len, M_SONAME,
	    canwait ? M_WAITOK : M_NOWAIT);
	if (sa2) {
		bcopy(sa, sa2, sa->sa_len);
	}
	return sa2;
}
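
/*
 * Usage sketch: the copy returned by dup_sockaddr() is allocated with
 * M_SONAME and must eventually be released by the caller, e.g.:
 *
 *	struct sockaddr *copy = dup_sockaddr(sa, 1);	// may block
 *	if (copy != NULL) {
 *		...
 *		FREE(copy, M_SONAME);
 *	}
 */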
/*
 * Create an external-format (``xsocket'') structure using the information
 * in the kernel-format socket structure pointed to by so.  This is done
 * to reduce the spew of irrelevant information over this interface,
 * to isolate user code from changes in the kernel structure, and
 * potentially to provide information-hiding if we decide that
 * some of this information should be hidden from users.
 */
void
sotoxsocket(struct socket *so, struct xsocket *xso)
{
	xso->xso_len = sizeof(*xso);
	xso->xso_so = (_XSOCKET_PTR(struct socket *))VM_KERNEL_ADDRPERM(so);
	xso->so_type = so->so_type;
	xso->so_options = (short)(so->so_options & 0xffff);
	xso->so_linger = so->so_linger;
	xso->so_state = so->so_state;
	xso->so_pcb = (_XSOCKET_PTR(caddr_t))VM_KERNEL_ADDRPERM(so->so_pcb);
	if (so->so_proto) {
		xso->xso_protocol = SOCK_PROTO(so);
		xso->xso_family = SOCK_DOM(so);
	} else {
		xso->xso_protocol = xso->xso_family = 0;
	}
	xso->so_qlen = so->so_qlen;
	xso->so_incqlen = so->so_incqlen;
	xso->so_qlimit = so->so_qlimit;
	xso->so_timeo = so->so_timeo;
	xso->so_error = so->so_error;
	xso->so_pgid = so->so_pgid;
	xso->so_oobmark = so->so_oobmark;
	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
	xso->so_uid = kauth_cred_getuid(so->so_cred);
}
#if !CONFIG_EMBEDDED

void
sotoxsocket64(struct socket *so, struct xsocket64 *xso)
{
	xso->xso_len = sizeof(*xso);
	xso->xso_so = (u_int64_t)VM_KERNEL_ADDRPERM(so);
	xso->so_type = so->so_type;
	xso->so_options = (short)(so->so_options & 0xffff);
	xso->so_linger = so->so_linger;
	xso->so_state = so->so_state;
	xso->so_pcb = (u_int64_t)VM_KERNEL_ADDRPERM(so->so_pcb);
	if (so->so_proto) {
		xso->xso_protocol = SOCK_PROTO(so);
		xso->xso_family = SOCK_DOM(so);
	} else {
		xso->xso_protocol = xso->xso_family = 0;
	}
	xso->so_qlen = so->so_qlen;
	xso->so_incqlen = so->so_incqlen;
	xso->so_qlimit = so->so_qlimit;
	xso->so_timeo = so->so_timeo;
	xso->so_error = so->so_error;
	xso->so_pgid = so->so_pgid;
	xso->so_oobmark = so->so_oobmark;
	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
	xso->so_uid = kauth_cred_getuid(so->so_cred);
}

#endif /* !CONFIG_EMBEDDED */
/*
 * This does the same for sockbufs.  Note that the xsockbuf structure,
 * since it is always embedded in a socket, does not include a self
 * pointer nor a length.  We make this entry point public in case
 * some other mechanism needs it.
 */
void
sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
{
	xsb->sb_cc = sb->sb_cc;
	xsb->sb_hiwat = sb->sb_hiwat;
	xsb->sb_mbcnt = sb->sb_mbcnt;
	xsb->sb_mbmax = sb->sb_mbmax;
	xsb->sb_lowat = sb->sb_lowat;
	xsb->sb_flags = sb->sb_flags;
	xsb->sb_timeo = (short)
	    (sb->sb_timeo.tv_sec * hz) + sb->sb_timeo.tv_usec / tick;
	if (xsb->sb_timeo == 0 && sb->sb_timeo.tv_usec != 0) {
		xsb->sb_timeo = 1;
	}
}
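
/*
 * Conversion note (illustrative arithmetic): sb_timeo is exported in
 * ticks.  With hz = 100, a timeout of { tv_sec = 2, tv_usec = 500000 }
 * becomes 2 * 100 + 500000 / tick = 250 ticks; a nonzero timeout that
 * would otherwise round down to 0 is reported as 1 so callers can still
 * tell it apart from "no timeout".
 */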
/*
 * Based on the policy set by an all-knowing decision maker, throttle
 * sockets that have been marked as belonging to a "background" process.
 */
inline int
soisthrottled(struct socket *so)
{
	return so->so_flags1 & SOF1_TRAFFIC_MGT_SO_BACKGROUND;
}
inline int
soisprivilegedtraffic(struct socket *so)
{
	return (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS) ? 1 : 0;
}

inline int
soissrcbackground(struct socket *so)
{
	return (so->so_flags1 & SOF1_TRAFFIC_MGT_SO_BACKGROUND) ||
	       IS_SO_TC_BACKGROUND(so->so_traffic_class);
}

inline int
soissrcrealtime(struct socket *so)
{
	return so->so_traffic_class >= SO_TC_AV &&
	       so->so_traffic_class <= SO_TC_VO;
}

inline int
soissrcbesteffort(struct socket *so)
{
	return so->so_traffic_class == SO_TC_BE ||
	       so->so_traffic_class == SO_TC_RD ||
	       so->so_traffic_class == SO_TC_OAM;
}

void
soclearfastopen(struct socket *so)
{
	if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
		so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
	}

	if (so->so_flags1 & SOF1_DATA_IDEMPOTENT) {
		so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
	}
}

void
sonullevent(struct socket *so, void *arg, uint32_t hint)
{
#pragma unused(so, arg, hint)
}
/*
 * Here is the definition of some of the basic objects in the kern.ipc
 * branch of the MIB.
 */
SYSCTL_NODE(_kern, KERN_IPC, ipc,
    CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, 0, "IPC");

/* Check that the maximum socket buffer size is within a range */
static int
sysctl_sb_max SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	u_int32_t new_value;
	int changed = 0;
	int error = sysctl_io_number(req, sb_max, sizeof(u_int32_t),
	    &new_value, &changed);
	if (!error && changed) {
		if (new_value > LOW_SB_MAX && new_value <= high_sb_max) {
			sb_max = new_value;
		} else {
			error = ERANGE;
		}
	}
	return error;
}
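
/*
 * Illustrative user-space sketch (not part of the kernel): the handler
 * above backs kern.ipc.maxsockbuf, so a sufficiently privileged process
 * could adjust it via sysctlbyname(3); values outside the
 * (LOW_SB_MAX, high_sb_max] window are rejected with ERANGE:
 *
 *	u_int32_t newmax = 4 * 1024 * 1024;
 *	sysctlbyname("kern.ipc.maxsockbuf", NULL, NULL,
 *	    &newmax, sizeof(newmax));
 */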
SYSCTL_PROC(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &sb_max, 0, &sysctl_sb_max, "IU", "Maximum socket buffer size");

SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sb_efficiency, 0, "");

SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters,
    CTLFLAG_RD | CTLFLAG_LOCKED, &nmbclusters, 0, "");

SYSCTL_INT(_kern_ipc, OID_AUTO, njcl,
    CTLFLAG_RD | CTLFLAG_LOCKED, &njcl, 0, "");

SYSCTL_INT(_kern_ipc, OID_AUTO, njclbytes,
    CTLFLAG_RD | CTLFLAG_LOCKED, &njclbytes, 0, "");

SYSCTL_INT(_kern_ipc, KIPC_SOQLIMITCOMPAT, soqlimitcompat,
    CTLFLAG_RW | CTLFLAG_LOCKED, &soqlimitcompat, 1,
    "Enable socket queue limit compatibility");
/*
 * Hack alert -- rdar://33572856
 * A loopback test we cannot change was failing because it sets
 * SO_SENDTIMEO to 5 seconds and that's also the value
 * of the minimum persist timer.  Because of the persist timer,
 * the connection was not idle for 5 seconds and SO_SNDTIMEO
 * was not triggering at 5 seconds causing the test failure.
 * As a workaround we check the sysctl soqlencomp the test is already
 * setting, and use it to disable auto tuning of the receive buffer.
 */
extern u_int32_t tcp_do_autorcvbuf;
static int
sysctl_soqlencomp SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	u_int32_t new_value;
	int changed = 0;
	int error = sysctl_io_number(req, soqlencomp, sizeof(u_int32_t),
	    &new_value, &changed);
	if (!error && changed) {
		soqlencomp = new_value;
		if (new_value != 0) {
			tcp_do_autorcvbuf = 0;
			tcptv_persmin_val = 6 * TCP_RETRANSHZ;
		}
	}
	return error;
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, soqlencomp,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &soqlencomp, 0, &sysctl_soqlencomp, "IU", "");

SYSCTL_INT(_kern_ipc, OID_AUTO, sbmb_cnt, CTLFLAG_RD | CTLFLAG_LOCKED,
    &total_sbmb_cnt, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sbmb_cnt_peak, CTLFLAG_RD | CTLFLAG_LOCKED,
    &total_sbmb_cnt_peak, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sbmb_cnt_floor, CTLFLAG_RD | CTLFLAG_LOCKED,
    &total_sbmb_cnt_floor, 0, "");
SYSCTL_QUAD(_kern_ipc, OID_AUTO, sbmb_limreached, CTLFLAG_RD | CTLFLAG_LOCKED,
    &sbmb_limreached, "");


SYSCTL_NODE(_kern_ipc, OID_AUTO, io_policy, CTLFLAG_RW, 0, "network IO policy");

SYSCTL_INT(_kern_ipc_io_policy, OID_AUTO, log, CTLFLAG_RW | CTLFLAG_LOCKED,
    &net_io_policy_log, 0, "");

#if CONFIG_PROC_UUID_POLICY
SYSCTL_INT(_kern_ipc_io_policy, OID_AUTO, uuid, CTLFLAG_RW | CTLFLAG_LOCKED,
    &net_io_policy_uuid, 0, "");
#endif /* CONFIG_PROC_UUID_POLICY */