bsd/kern/uipc_socket2.c

   1 /*
   2  * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  31  *      The Regents of the University of California.  All rights reserved.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  * 3. All advertising materials mentioning features or use of this software
  42  *    must display the following acknowledgement:
  43  *      This product includes software developed by the University of
  44  *      California, Berkeley and its contributors.
  45  * 4. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  *      @(#)uipc_socket2.c      8.1 (Berkeley) 6/10/93
  62  */
  63 /*
  64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  65  * support for mandatory and extensible security protections.  This notice
  66  * is included in support of clause 2.2 (b) of the Apple Public License,
  67  * Version 2.0.
  68  */
  69
  70 #include <sys/param.h>
  71 #include <sys/systm.h>
  72 #include <sys/domain.h>
  73 #include <sys/kernel.h>
  74 #include <sys/proc_internal.h>
  75 #include <sys/kauth.h>
  76 #include <sys/malloc.h>
  77 #include <sys/mbuf.h>
  78 #include <sys/mcache.h>
  79 #include <sys/protosw.h>
  80 #include <sys/stat.h>
  81 #include <sys/socket.h>
  82 #include <sys/socketvar.h>
  83 #include <sys/signalvar.h>
  84 #include <sys/sysctl.h>
  85 #include <sys/syslog.h>
  86 #include <sys/ev.h>
  87 #include <kern/locks.h>
  88 #include <net/route.h>
  89 #include <net/content_filter.h>
  90 #include <netinet/in.h>
  91 #include <netinet/in_pcb.h>
  92 #include <netinet/tcp_var.h>
  93 #include <sys/kdebug.h>
  94 #include <libkern/OSAtomic.h>
  95
  96 #if CONFIG_MACF
  97 #include <security/mac_framework.h>
  98 #endif
  99
 100 #include <mach/vm_param.h>
 101
 102 #if MPTCP
 103 #include <netinet/mptcp_var.h>
 104 #endif
 105
/*
 * Debug codes in the DBG_NETSOCK class, built with NETDBG_CODE().
 * NOTE(review): presumably consumed by the sbdrop/sbappend trace points
 * further down in this file -- not visible in this section, confirm.
 */
#define DBG_FNC_SBDROP          NETDBG_CODE(DBG_NETSOCK, 4)
#define DBG_FNC_SBAPPEND        NETDBG_CODE(DBG_NETSOCK, 5)

SYSCTL_DECL(_kern_ipc);

/*
 * Read-write integer knob registered under the kern.ipc sysctl node as
 * "throttle_best_effort"; defaults to 0.
 */
__private_extern__ u_int32_t net_io_policy_throttle_best_effort = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, throttle_best_effort,
    CTLFLAG_RW | CTLFLAG_LOCKED, &net_io_policy_throttle_best_effort, 0, "");

/* Forward declarations of helpers private to this file. */
static inline void sbcompress(struct sockbuf *, struct mbuf *, struct mbuf *);
static struct socket *sonewconn_internal(struct socket *, int);
static int sbappendcontrol_internal(struct sockbuf *, struct mbuf *,
    struct mbuf *);
static void soevent_ifdenied(struct socket *);

/*
 * Primitive routines for operating on sockets and socket buffers
 */
/*
 * Listen-queue tunables read by sonewconn_internal():
 *  - soqlimitcompat: when non-zero the queue limit is so_qlimit itself,
 *    otherwise 3 * so_qlimit / 2.
 *  - soqlencomp: when non-zero only the completed queue is counted
 *    against the limit (the incomplete queue is capped at somaxconn).
 */
static int soqlimitcompat = 1;
static int soqlencomp = 0;

/*
 * Based on the number of mbuf clusters configured, high_sb_max and sb_max can
 * get scaled up or down to suit that memory configuration. high_sb_max is a
 * higher limit on sb_max that is checked when sb_max gets set through sysctl.
 */

u_int32_t       sb_max = SB_MAX;                /* XXX should be static */
u_int32_t       high_sb_max = SB_MAX;

/* Multiplier sbreserve() applies to the byte reservation to derive sb_mbmax. */
static  u_int32_t sb_efficiency = 8;    /* parameter for sbreserve() */
/*
 * NOTE(review): the four counters below are not referenced in this section
 * of the file; judging by their names they track socket-buffer mbuf usage
 * (current, floor, peak, limit hits) -- confirm against their users.
 */
int32_t total_sbmb_cnt __attribute__((aligned(8))) = 0;
int32_t total_sbmb_cnt_floor __attribute__((aligned(8))) = 0;
int32_t total_sbmb_cnt_peak __attribute__((aligned(8))) = 0;
int64_t sbmb_limreached __attribute__((aligned(8))) = 0;

u_int32_t net_io_policy_log = 0;        /* log socket policy changes */
#if CONFIG_PROC_UUID_POLICY
u_int32_t net_io_policy_uuid = 1;       /* enable UUID socket policy */
#endif /* CONFIG_PROC_UUID_POLICY */
 146
 147 /*
 148  * Procedures to manipulate state flags of socket
 149  * and do appropriate wakeups.  Normal sequence from the
 150  * active (originating) side is that soisconnecting() is
 151  * called during processing of connect() call,
 152  * resulting in an eventual call to soisconnected() if/when the
 153  * connection is established.  When the connection is torn down
 154  * soisdisconnecting() is called during processing of disconnect() call,
 155  * and soisdisconnected() is called when the connection to the peer
 156  * is totally severed.  The semantics of these routines are such that
 157  * connectionless protocols can call soisconnected() and soisdisconnected()
 158  * only, bypassing the in-progress calls when setting up a ``connection''
 159  * takes no time.
 160  *
 161  * From the passive side, a socket is created with
 162  * two queues of sockets: so_incomp for connections in progress
 163  * and so_comp for connections already made and awaiting user acceptance.
 164  * As a protocol is preparing incoming connections, it creates a socket
 165  * structure queued on so_incomp by calling sonewconn().  When the connection
 166  * is established, soisconnected() is called, and transfers the
 167  * socket structure to so_comp, making it available to accept().
 168  *
 169  * If a socket is closed with sockets on either
 170  * so_incomp or so_comp, these sockets are dropped.
 171  *
 172  * If higher level protocols are implemented in
 173  * the kernel, the wakeups done here will sometimes
 174  * cause software-interrupt process scheduling.
 175  */
 176 void
 177 soisconnecting(struct socket *so)
 178 {
 179         so->so_state &= ~(SS_ISCONNECTED | SS_ISDISCONNECTING);
 180         so->so_state |= SS_ISCONNECTING;
 181
 182         sflt_notify(so, sock_evt_connecting, NULL);
 183 }
 184
 185 void
 186 soisconnected(struct socket *so)
 187 {
 188         so->so_state &= ~(SS_ISCONNECTING | SS_ISDISCONNECTING | SS_ISCONFIRMING);
 189         so->so_state |= SS_ISCONNECTED;
 190
 191         soreserve_preconnect(so, 0);
 192
 193         sflt_notify(so, sock_evt_connected, NULL);
 194
 195         if (so->so_head != NULL && (so->so_state & SS_INCOMP)) {
 196                 struct socket *head = so->so_head;
 197                 int locked = 0;
 198
 199                 /*
 200                  * Enforce lock order when the protocol has per socket locks
 201                  */
 202                 if (head->so_proto->pr_getlock != NULL) {
 203                         socket_lock(head, 1);
 204                         so_acquire_accept_list(head, so);
 205                         locked = 1;
 206                 }
 207                 if (so->so_head == head && (so->so_state & SS_INCOMP)) {
 208                         so->so_state &= ~SS_INCOMP;
 209                         so->so_state |= SS_COMP;
 210                         TAILQ_REMOVE(&head->so_incomp, so, so_list);
 211                         TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
 212                         head->so_incqlen--;
 213
 214                         /*
 215                          * We have to release the accept list in
 216                          * case a socket callback calls sock_accept()
 217                          */
 218                         if (locked != 0) {
 219                                 so_release_accept_list(head);
 220                                 socket_unlock(so, 0);
 221                         }
 222                         postevent(head, 0, EV_RCONN);
 223                         sorwakeup(head);
 224                         wakeup_one((caddr_t)&head->so_timeo);
 225
 226                         if (locked != 0) {
 227                                 socket_unlock(head, 1);
 228                                 socket_lock(so, 0);
 229                         }
 230                 } else if (locked != 0) {
 231                         so_release_accept_list(head);
 232                         socket_unlock(head, 1);
 233                 }
 234         } else {
 235                 postevent(so, 0, EV_WCONN);
 236                 wakeup((caddr_t)&so->so_timeo);
 237                 sorwakeup(so);
 238                 sowwakeup(so);
 239                 soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNECTED |
 240                     SO_FILT_HINT_CONNINFO_UPDATED);
 241         }
 242 }
 243
 244 boolean_t
 245 socanwrite(struct socket *so)
 246 {
 247         return (so->so_state & SS_ISCONNECTED) ||
 248                !(so->so_proto->pr_flags & PR_CONNREQUIRED) ||
 249                (so->so_flags1 & SOF1_PRECONNECT_DATA);
 250 }
 251
 252 void
 253 soisdisconnecting(struct socket *so)
 254 {
 255         so->so_state &= ~SS_ISCONNECTING;
 256         so->so_state |= (SS_ISDISCONNECTING | SS_CANTRCVMORE | SS_CANTSENDMORE);
 257         soevent(so, SO_FILT_HINT_LOCKED);
 258         sflt_notify(so, sock_evt_disconnecting, NULL);
 259         wakeup((caddr_t)&so->so_timeo);
 260         sowwakeup(so);
 261         sorwakeup(so);
 262 }
 263
 264 void
 265 soisdisconnected(struct socket *so)
 266 {
 267         so->so_state &= ~(SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING);
 268         so->so_state |= (SS_CANTRCVMORE | SS_CANTSENDMORE | SS_ISDISCONNECTED);
 269         soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED |
 270             SO_FILT_HINT_CONNINFO_UPDATED);
 271         sflt_notify(so, sock_evt_disconnected, NULL);
 272         wakeup((caddr_t)&so->so_timeo);
 273         sowwakeup(so);
 274         sorwakeup(so);
 275
 276 #if CONTENT_FILTER
 277         /* Notify content filters as soon as we cannot send/receive data */
 278         cfil_sock_notify_shutdown(so, SHUT_RDWR);
 279 #endif /* CONTENT_FILTER */
 280 }
 281
 282 /*
 283  * This function will issue a wakeup like soisdisconnected but it will not
 284  * notify the socket filters. This will avoid unlocking the socket
 285  * in the midst of closing it.
 286  */
 287 void
 288 sodisconnectwakeup(struct socket *so)
 289 {
 290         so->so_state &= ~(SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING);
 291         so->so_state |= (SS_CANTRCVMORE | SS_CANTSENDMORE | SS_ISDISCONNECTED);
 292         soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED |
 293             SO_FILT_HINT_CONNINFO_UPDATED);
 294         wakeup((caddr_t)&so->so_timeo);
 295         sowwakeup(so);
 296         sorwakeup(so);
 297
 298 #if CONTENT_FILTER
 299         /* Notify content filters as soon as we cannot send/receive data */
 300         cfil_sock_notify_shutdown(so, SHUT_RDWR);
 301 #endif /* CONTENT_FILTER */
 302 }
 303
 304 /*
 305  * When an attempt at a new connection is noted on a socket
 306  * which accepts connections, sonewconn is called.  If the
 307  * connection is possible (subject to space constraints, etc.)
 308  * then we allocate a new structure, propoerly linked into the
 309  * data structure of the original socket, and return this.
 310  * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED.
 311  */
 312 static struct socket *
 313 sonewconn_internal(struct socket *head, int connstatus)
 314 {
 315         int so_qlen, error = 0;
 316         struct socket *so;
 317         lck_mtx_t *mutex_held;
 318
 319         if (head->so_proto->pr_getlock != NULL) {
 320                 mutex_held = (*head->so_proto->pr_getlock)(head, 0);
 321         } else {
 322                 mutex_held = head->so_proto->pr_domain->dom_mtx;
 323         }
 324         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
 325
 326         if (!soqlencomp) {
 327                 /*
 328                  * This is the default case; so_qlen represents the
 329                  * sum of both incomplete and completed queues.
 330                  */
 331                 so_qlen = head->so_qlen;
 332         } else {
 333                 /*
 334                  * When kern.ipc.soqlencomp is set to 1, so_qlen
 335                  * represents only the completed queue.  Since we
 336                  * cannot let the incomplete queue goes unbounded
 337                  * (in case of SYN flood), we cap the incomplete
 338                  * queue length to at most somaxconn, and use that
 339                  * as so_qlen so that we fail immediately below.
 340                  */
 341                 so_qlen = head->so_qlen - head->so_incqlen;
 342                 if (head->so_incqlen > somaxconn) {
 343                         so_qlen = somaxconn;
 344                 }
 345         }
 346
 347         if (so_qlen >=
 348             (soqlimitcompat ? head->so_qlimit : (3 * head->so_qlimit / 2))) {
 349                 return (struct socket *)0;
 350         }
 351         so = soalloc(1, SOCK_DOM(head), head->so_type);
 352         if (so == NULL) {
 353                 return (struct socket *)0;
 354         }
 355         /* check if head was closed during the soalloc */
 356         if (head->so_proto == NULL) {
 357                 sodealloc(so);
 358                 return (struct socket *)0;
 359         }
 360
 361         so->so_type = head->so_type;
 362         so->so_options = head->so_options & ~SO_ACCEPTCONN;
 363         so->so_linger = head->so_linger;
 364         so->so_state = head->so_state | SS_NOFDREF;
 365         so->so_proto = head->so_proto;
 366         so->so_timeo = head->so_timeo;
 367         so->so_pgid  = head->so_pgid;
 368         kauth_cred_ref(head->so_cred);
 369         so->so_cred = head->so_cred;
 370         so->last_pid = head->last_pid;
 371         so->last_upid = head->last_upid;
 372         memcpy(so->last_uuid, head->last_uuid, sizeof(so->last_uuid));
 373         if (head->so_flags & SOF_DELEGATED) {
 374                 so->e_pid = head->e_pid;
 375                 so->e_upid = head->e_upid;
 376                 memcpy(so->e_uuid, head->e_uuid, sizeof(so->e_uuid));
 377         }
 378         /* inherit socket options stored in so_flags */
 379         so->so_flags = head->so_flags &
 380             (SOF_NOSIGPIPE | SOF_NOADDRAVAIL | SOF_REUSESHAREUID |
 381             SOF_NOTIFYCONFLICT | SOF_BINDRANDOMPORT | SOF_NPX_SETOPTSHUT |
 382             SOF_NODEFUNCT | SOF_PRIVILEGED_TRAFFIC_CLASS | SOF_NOTSENT_LOWAT |
 383             SOF_USELRO | SOF_DELEGATED);
 384         so->so_usecount = 1;
 385         so->next_lock_lr = 0;
 386         so->next_unlock_lr = 0;
 387
 388         so->so_rcv.sb_flags |= SB_RECV; /* XXX */
 389         so->so_rcv.sb_so = so->so_snd.sb_so = so;
 390         TAILQ_INIT(&so->so_evlist);
 391
 392 #if CONFIG_MACF_SOCKET
 393         mac_socket_label_associate_accept(head, so);
 394 #endif
 395
 396         /* inherit traffic management properties of listener */
 397         so->so_flags1 |=
 398             head->so_flags1 & (SOF1_TRAFFIC_MGT_SO_BACKGROUND);
 399         so->so_background_thread = head->so_background_thread;
 400         so->so_traffic_class = head->so_traffic_class;
 401
 402         if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
 403                 sodealloc(so);
 404                 return (struct socket *)0;
 405         }
 406         so->so_rcv.sb_flags |= (head->so_rcv.sb_flags & SB_USRSIZE);
 407         so->so_snd.sb_flags |= (head->so_snd.sb_flags & SB_USRSIZE);
 408
 409         /*
 410          * Must be done with head unlocked to avoid deadlock
 411          * for protocol with per socket mutexes.
 412          */
 413         if (head->so_proto->pr_unlock) {
 414                 socket_unlock(head, 0);
 415         }
 416         if (((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL) != 0) ||
 417             error) {
 418                 sodealloc(so);
 419                 if (head->so_proto->pr_unlock) {
 420                         socket_lock(head, 0);
 421                 }
 422                 return (struct socket *)0;
 423         }
 424         if (head->so_proto->pr_unlock) {
 425                 socket_lock(head, 0);
 426                 /*
 427                  * Radar 7385998 Recheck that the head is still accepting
 428                  * to avoid race condition when head is getting closed.
 429                  */
 430                 if ((head->so_options & SO_ACCEPTCONN) == 0) {
 431                         so->so_state &= ~SS_NOFDREF;
 432                         soclose(so);
 433                         return (struct socket *)0;
 434                 }
 435         }
 436
 437         atomic_add_32(&so->so_proto->pr_domain->dom_refs, 1);
 438
 439         /* Insert in head appropriate lists */
 440         so_acquire_accept_list(head, NULL);
 441
 442         so->so_head = head;
 443
 444         /*
 445          * Since this socket is going to be inserted into the incomp
 446          * queue, it can be picked up by another thread in
 447          * tcp_dropdropablreq to get dropped before it is setup..
 448          * To prevent this race, set in-progress flag which can be
 449          * cleared later
 450          */
 451         so->so_flags |= SOF_INCOMP_INPROGRESS;
 452
 453         if (connstatus) {
 454                 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
 455                 so->so_state |= SS_COMP;
 456         } else {
 457                 TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
 458                 so->so_state |= SS_INCOMP;
 459                 head->so_incqlen++;
 460         }
 461         head->so_qlen++;
 462
 463         so_release_accept_list(head);
 464
 465         /* Attach socket filters for this protocol */
 466         sflt_initsock(so);
 467
 468         if (connstatus) {
 469                 so->so_state |= connstatus;
 470                 sorwakeup(head);
 471                 wakeup((caddr_t)&head->so_timeo);
 472         }
 473         return so;
 474 }
 475
 476
 477 struct socket *
 478 sonewconn(struct socket *head, int connstatus, const struct sockaddr *from)
 479 {
 480         int error = sflt_connectin(head, from);
 481         if (error) {
 482                 return NULL;
 483         }
 484
 485         return sonewconn_internal(head, connstatus);
 486 }
 487
 488 /*
 489  * Socantsendmore indicates that no more data will be sent on the
 490  * socket; it would normally be applied to a socket when the user
 491  * informs the system that no more data is to be sent, by the protocol
 492  * code (in case PRU_SHUTDOWN).  Socantrcvmore indicates that no more data
 493  * will be received, and will normally be applied to the socket by a
 494  * protocol when it detects that the peer will send no more data.
 495  * Data queued for reading in the socket may yet be read.
 496  */
 497
 498 void
 499 socantsendmore(struct socket *so)
 500 {
 501         so->so_state |= SS_CANTSENDMORE;
 502         soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CANTSENDMORE);
 503         sflt_notify(so, sock_evt_cantsendmore, NULL);
 504         sowwakeup(so);
 505 }
 506
 507 void
 508 socantrcvmore(struct socket *so)
 509 {
 510         so->so_state |= SS_CANTRCVMORE;
 511         soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CANTRCVMORE);
 512         sflt_notify(so, sock_evt_cantrecvmore, NULL);
 513         sorwakeup(so);
 514 }
 515
 516 /*
 517  * Wait for data to arrive at/drain from a socket buffer.
 518  */
 519 int
 520 sbwait(struct sockbuf *sb)
 521 {
 522         boolean_t nointr = (sb->sb_flags & SB_NOINTR);
 523         void *lr_saved = __builtin_return_address(0);
 524         struct socket *so = sb->sb_so;
 525         lck_mtx_t *mutex_held;
 526         struct timespec ts;
 527         int error = 0;
 528
 529         if (so == NULL) {
 530                 panic("%s: null so, sb=%p sb_flags=0x%x lr=%p\n",
 531                     __func__, sb, sb->sb_flags, lr_saved);
 532                 /* NOTREACHED */
 533         } else if (so->so_usecount < 1) {
 534                 panic("%s: sb=%p sb_flags=0x%x sb_so=%p usecount=%d lr=%p "
 535                     "lrh= %s\n", __func__, sb, sb->sb_flags, so,
 536                     so->so_usecount, lr_saved, solockhistory_nr(so));
 537                 /* NOTREACHED */
 538         }
 539
 540         if ((so->so_state & SS_DRAINING) || (so->so_flags & SOF_DEFUNCT)) {
 541                 error = EBADF;
 542                 if (so->so_flags & SOF_DEFUNCT) {
 543                         SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
 544                             "(%d)\n", __func__, proc_selfpid(),
 545                             proc_best_name(current_proc()),
 546                             (uint64_t)VM_KERNEL_ADDRPERM(so),
 547                             SOCK_DOM(so), SOCK_TYPE(so), error);
 548                 }
 549                 return error;
 550         }
 551
 552         if (so->so_proto->pr_getlock != NULL) {
 553                 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
 554         } else {
 555                 mutex_held = so->so_proto->pr_domain->dom_mtx;
 556         }
 557
 558         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
 559
 560         ts.tv_sec = sb->sb_timeo.tv_sec;
 561         ts.tv_nsec = sb->sb_timeo.tv_usec * 1000;
 562
 563         sb->sb_waiters++;
 564         VERIFY(sb->sb_waiters != 0);
 565
 566         error = msleep((caddr_t)&sb->sb_cc, mutex_held,
 567             nointr ? PSOCK : PSOCK | PCATCH,
 568             nointr ? "sbwait_nointr" : "sbwait", &ts);
 569
 570         VERIFY(sb->sb_waiters != 0);
 571         sb->sb_waiters--;
 572
 573         if (so->so_usecount < 1) {
 574                 panic("%s: 2 sb=%p sb_flags=0x%x sb_so=%p usecount=%d lr=%p "
 575                     "lrh= %s\n", __func__, sb, sb->sb_flags, so,
 576                     so->so_usecount, lr_saved, solockhistory_nr(so));
 577                 /* NOTREACHED */
 578         }
 579
 580         if ((so->so_state & SS_DRAINING) || (so->so_flags & SOF_DEFUNCT)) {
 581                 error = EBADF;
 582                 if (so->so_flags & SOF_DEFUNCT) {
 583                         SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
 584                             "(%d)\n", __func__, proc_selfpid(),
 585                             proc_best_name(current_proc()),
 586                             (uint64_t)VM_KERNEL_ADDRPERM(so),
 587                             SOCK_DOM(so), SOCK_TYPE(so), error);
 588                 }
 589         }
 590
 591         return error;
 592 }
 593
 594 void
 595 sbwakeup(struct sockbuf *sb)
 596 {
 597         if (sb->sb_waiters > 0) {
 598                 wakeup((caddr_t)&sb->sb_cc);
 599         }
 600 }
 601
 602 /*
 603  * Wakeup processes waiting on a socket buffer.
 604  * Do asynchronous notification via SIGIO
 605  * if the socket has the SS_ASYNC flag set.
 606  */
 607 void
 608 sowakeup(struct socket *so, struct sockbuf *sb)
 609 {
 610         if (so->so_flags & SOF_DEFUNCT) {
 611                 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] si 0x%x, "
 612                     "fl 0x%x [%s]\n", __func__, proc_selfpid(),
 613                     proc_best_name(current_proc()),
 614                     (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
 615                     SOCK_TYPE(so), (uint32_t)sb->sb_sel.si_flags, sb->sb_flags,
 616                     (sb->sb_flags & SB_RECV) ? "rcv" : "snd");
 617         }
 618
 619         sb->sb_flags &= ~SB_SEL;
 620         selwakeup(&sb->sb_sel);
 621         sbwakeup(sb);
 622         if (so->so_state & SS_ASYNC) {
 623                 if (so->so_pgid < 0) {
 624                         gsignal(-so->so_pgid, SIGIO);
 625                 } else if (so->so_pgid > 0) {
 626                         proc_signal(so->so_pgid, SIGIO);
 627                 }
 628         }
 629         if (sb->sb_flags & SB_KNOTE) {
 630                 KNOTE(&sb->sb_sel.si_note, SO_FILT_HINT_LOCKED);
 631         }
 632         if (sb->sb_flags & SB_UPCALL) {
 633                 void (*sb_upcall)(struct socket *, void *, int);
 634                 caddr_t sb_upcallarg;
 635                 int lock = !(sb->sb_flags & SB_UPCALL_LOCK);
 636
 637                 sb_upcall = sb->sb_upcall;
 638                 sb_upcallarg = sb->sb_upcallarg;
 639                 /* Let close know that we're about to do an upcall */
 640                 so->so_upcallusecount++;
 641
 642                 if (lock) {
 643                         socket_unlock(so, 0);
 644                 }
 645                 (*sb_upcall)(so, sb_upcallarg, M_DONTWAIT);
 646                 if (lock) {
 647                         socket_lock(so, 0);
 648                 }
 649
 650                 so->so_upcallusecount--;
 651                 /* Tell close that it's safe to proceed */
 652                 if ((so->so_flags & SOF_CLOSEWAIT) &&
 653                     so->so_upcallusecount == 0) {
 654                         wakeup((caddr_t)&so->so_upcallusecount);
 655                 }
 656         }
 657 #if CONTENT_FILTER
 658         /*
 659          * Trap disconnection events for content filters
 660          */
 661         if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
 662                 if ((sb->sb_flags & SB_RECV)) {
 663                         if (so->so_state & (SS_CANTRCVMORE)) {
 664                                 cfil_sock_notify_shutdown(so, SHUT_RD);
 665                         }
 666                 } else {
 667                         if (so->so_state & (SS_CANTSENDMORE)) {
 668                                 cfil_sock_notify_shutdown(so, SHUT_WR);
 669                         }
 670                 }
 671         }
 672 #endif /* CONTENT_FILTER */
 673 }
 674
 675 /*
 676  * Socket buffer (struct sockbuf) utility routines.
 677  *
 678  * Each socket contains two socket buffers: one for sending data and
 679  * one for receiving data.  Each buffer contains a queue of mbufs,
 680  * information about the number of mbufs and amount of data in the
 681  * queue, and other fields allowing select() statements and notification
 682  * on data availability to be implemented.
 683  *
 684  * Data stored in a socket buffer is maintained as a list of records.
 685  * Each record is a list of mbufs chained together with the m_next
 686  * field.  Records are chained together with the m_nextpkt field. The upper
 687  * level routine soreceive() expects the following conventions to be
 688  * observed when placing information in the receive buffer:
 689  *
 690  * 1. If the protocol requires each message be preceded by the sender's
 691  *    name, then a record containing that name must be present before
 692  *    any associated data (mbuf's must be of type MT_SONAME).
 693  * 2. If the protocol supports the exchange of ``access rights'' (really
 694  *    just additional data associated with the message), and there are
 695  *    ``rights'' to be received, then a record containing this data
 696  *    should be present (mbuf's must be of type MT_RIGHTS).
 697  * 3. If a name or rights record exists, then it must be followed by
 698  *    a data record, perhaps of zero length.
 699  *
 700  * Before using a new socket structure it is first necessary to reserve
 701  * buffer space to the socket, by calling sbreserve().  This should commit
 702  * some of the available buffer space in the system buffer pool for the
 703  * socket (currently, it does nothing but enforce limits).  The space
 704  * should be released by calling sbrelease() when the socket is destroyed.
 705  */
 706
 707 /*
 708  * Returns:     0                       Success
 709  *              ENOBUFS
 710  */
 711 int
 712 soreserve(struct socket *so, u_int32_t sndcc, u_int32_t rcvcc)
 713 {
 714         /*
 715          * We do not want to fail the creation of a socket
 716          * when kern.ipc.maxsockbuf is less than the
 717          * default socket buffer socket size of the protocol
 718          * so force the buffer sizes to be at most the
 719          * limit enforced by sbreserve()
 720          */
 721         uint64_t maxcc = (uint64_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES);
 722         if (sndcc > maxcc) {
 723                 sndcc = maxcc;
 724         }
 725         if (rcvcc > maxcc) {
 726                 rcvcc = maxcc;
 727         }
 728         if (sbreserve(&so->so_snd, sndcc) == 0) {
 729                 goto bad;
 730         } else {
 731                 so->so_snd.sb_idealsize = sndcc;
 732         }
 733
 734         if (sbreserve(&so->so_rcv, rcvcc) == 0) {
 735                 goto bad2;
 736         } else {
 737                 so->so_rcv.sb_idealsize = rcvcc;
 738         }
 739
 740         if (so->so_rcv.sb_lowat == 0) {
 741                 so->so_rcv.sb_lowat = 1;
 742         }
 743         if (so->so_snd.sb_lowat == 0) {
 744                 so->so_snd.sb_lowat = MCLBYTES;
 745         }
 746         if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat) {
 747                 so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
 748         }
 749         return 0;
 750 bad2:
 751         so->so_snd.sb_flags &= ~SB_SEL;
 752         selthreadclear(&so->so_snd.sb_sel);
 753         sbrelease(&so->so_snd);
 754 bad:
 755         return ENOBUFS;
 756 }
 757
 758 void
 759 soreserve_preconnect(struct socket *so, unsigned int pre_cc)
 760 {
 761         /* As of now, same bytes for both preconnect read and write */
 762         so->so_snd.sb_preconn_hiwat = pre_cc;
 763         so->so_rcv.sb_preconn_hiwat = pre_cc;
 764 }
 765
 766 /*
 767  * Allot mbufs to a sockbuf.
 768  * Attempt to scale mbmax so that mbcnt doesn't become limiting
 769  * if buffering efficiency is near the normal case.
 770  */
 771 int
 772 sbreserve(struct sockbuf *sb, u_int32_t cc)
 773 {
 774         if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES)) {
 775                 return 0;
 776         }
 777         sb->sb_hiwat = cc;
 778         sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
 779         if (sb->sb_lowat > sb->sb_hiwat) {
 780                 sb->sb_lowat = sb->sb_hiwat;
 781         }
 782         return 1;
 783 }
 784
 785 /*
 786  * Free mbufs held by a socket, and reserved mbuf space.
 787  */
 788 /*  WARNING needs to do selthreadclear() before calling this */
 789 void
 790 sbrelease(struct sockbuf *sb)
 791 {
 792         sbflush(sb);
 793         sb->sb_hiwat = 0;
 794         sb->sb_mbmax = 0;
 795 }
 796
 797 /*
 798  * Routines to add and remove
 799  * data from an mbuf queue.
 800  *
 801  * The routines sbappend() or sbappendrecord() are normally called to
 802  * append new mbufs to a socket buffer, after checking that adequate
 803  * space is available, comparing the function sbspace() with the amount
 804  * of data to be added.  sbappendrecord() differs from sbappend() in
 805  * that data supplied is treated as the beginning of a new record.
 806  * To place a sender's address, optional access rights, and data in a
 807  * socket receive buffer, sbappendaddr() should be used.  To place
 808  * access rights and data in a socket receive buffer, sbappendrights()
 809  * should be used.  In either case, the new data begins a new record.
 810  * Note that unlike sbappend() and sbappendrecord(), these routines check
 811  * for the caller that there will be enough space to store the data.
 812  * Each fails if there is not enough space, or if it cannot find mbufs
 813  * to store additional information in.
 814  *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  A protocol normally copies data from the
 * socket send buffer with m_copy for output to a peer, and then removes
 * the data from the socket buffer with sbdrop() or sbdroprecord() once
 * the peer has acknowledged it.
 820  */
 821
 822 /*
 823  * Append mbuf chain m to the last record in the
 824  * socket buffer sb.  The additional space associated
 825  * the mbuf chain is recorded in sb.  Empty mbufs are
 826  * discarded and mbufs are compacted where possible.
 827  */
 828 int
 829 sbappend(struct sockbuf *sb, struct mbuf *m)
 830 {
 831         struct socket *so = sb->sb_so;
 832
 833         if (m == NULL || (sb->sb_flags & SB_DROP)) {
 834                 if (m != NULL) {
 835                         m_freem(m);
 836                 }
 837                 return 0;
 838         }
 839
 840         SBLASTRECORDCHK(sb, "sbappend 1");
 841
 842         if (sb->sb_lastrecord != NULL && (sb->sb_mbtail->m_flags & M_EOR)) {
 843                 return sbappendrecord(sb, m);
 844         }
 845
 846         if (sb->sb_flags & SB_RECV && !(m && m->m_flags & M_SKIPCFIL)) {
 847                 int error = sflt_data_in(so, NULL, &m, NULL, 0);
 848                 SBLASTRECORDCHK(sb, "sbappend 2");
 849
 850 #if CONTENT_FILTER
 851                 if (error == 0) {
 852                         error = cfil_sock_data_in(so, NULL, m, NULL, 0);
 853                 }
 854 #endif /* CONTENT_FILTER */
 855
 856                 if (error != 0) {
 857                         if (error != EJUSTRETURN) {
 858                                 m_freem(m);
 859                         }
 860                         return 0;
 861                 }
 862         } else if (m) {
 863                 m->m_flags &= ~M_SKIPCFIL;
 864         }
 865
 866         /* If this is the first record, it's also the last record */
 867         if (sb->sb_lastrecord == NULL) {
 868                 sb->sb_lastrecord = m;
 869         }
 870
 871         sbcompress(sb, m, sb->sb_mbtail);
 872         SBLASTRECORDCHK(sb, "sbappend 3");
 873         return 1;
 874 }
 875
 876 /*
 877  * Similar to sbappend, except that this is optimized for stream sockets.
 878  */
 879 int
 880 sbappendstream(struct sockbuf *sb, struct mbuf *m)
 881 {
 882         struct socket *so = sb->sb_so;
 883
 884         if (m == NULL || (sb->sb_flags & SB_DROP)) {
 885                 if (m != NULL) {
 886                         m_freem(m);
 887                 }
 888                 return 0;
 889         }
 890
 891         if (m->m_nextpkt != NULL || (sb->sb_mb != sb->sb_lastrecord)) {
 892                 panic("sbappendstream: nexpkt %p || mb %p != lastrecord %p\n",
 893                     m->m_nextpkt, sb->sb_mb, sb->sb_lastrecord);
 894                 /* NOTREACHED */
 895         }
 896
 897         SBLASTMBUFCHK(sb, __func__);
 898
 899         if (sb->sb_flags & SB_RECV && !(m && m->m_flags & M_SKIPCFIL)) {
 900                 int error = sflt_data_in(so, NULL, &m, NULL, 0);
 901                 SBLASTRECORDCHK(sb, "sbappendstream 1");
 902
 903 #if CONTENT_FILTER
 904                 if (error == 0) {
 905                         error = cfil_sock_data_in(so, NULL, m, NULL, 0);
 906                 }
 907 #endif /* CONTENT_FILTER */
 908
 909                 if (error != 0) {
 910                         if (error != EJUSTRETURN) {
 911                                 m_freem(m);
 912                         }
 913                         return 0;
 914                 }
 915         } else if (m) {
 916                 m->m_flags &= ~M_SKIPCFIL;
 917         }
 918
 919         sbcompress(sb, m, sb->sb_mbtail);
 920         sb->sb_lastrecord = sb->sb_mb;
 921         SBLASTRECORDCHK(sb, "sbappendstream 2");
 922         return 1;
 923 }
 924
 925 #ifdef SOCKBUF_DEBUG
 926 void
 927 sbcheck(struct sockbuf *sb)
 928 {
 929         struct mbuf *m;
 930         struct mbuf *n = 0;
 931         u_int32_t len = 0, mbcnt = 0;
 932         lck_mtx_t *mutex_held;
 933
 934         if (sb->sb_so->so_proto->pr_getlock != NULL) {
 935                 mutex_held = (*sb->sb_so->so_proto->pr_getlock)(sb->sb_so, 0);
 936         } else {
 937                 mutex_held = sb->sb_so->so_proto->pr_domain->dom_mtx;
 938         }
 939
 940         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
 941
 942         if (sbchecking == 0) {
 943                 return;
 944         }
 945
 946         for (m = sb->sb_mb; m; m = n) {
 947                 n = m->m_nextpkt;
 948                 for (; m; m = m->m_next) {
 949                         len += m->m_len;
 950                         mbcnt += MSIZE;
 951                         /* XXX pretty sure this is bogus */
 952                         if (m->m_flags & M_EXT) {
 953                                 mbcnt += m->m_ext.ext_size;
 954                         }
 955                 }
 956         }
 957         if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
 958                 panic("cc %ld != %ld || mbcnt %ld != %ld\n", len, sb->sb_cc,
 959                     mbcnt, sb->sb_mbcnt);
 960         }
 961 }
 962 #endif
 963
 964 void
 965 sblastrecordchk(struct sockbuf *sb, const char *where)
 966 {
 967         struct mbuf *m = sb->sb_mb;
 968
 969         while (m && m->m_nextpkt) {
 970                 m = m->m_nextpkt;
 971         }
 972
 973         if (m != sb->sb_lastrecord) {
 974                 printf("sblastrecordchk: mb 0x%llx lastrecord 0x%llx "
 975                     "last 0x%llx\n",
 976                     (uint64_t)VM_KERNEL_ADDRPERM(sb->sb_mb),
 977                     (uint64_t)VM_KERNEL_ADDRPERM(sb->sb_lastrecord),
 978                     (uint64_t)VM_KERNEL_ADDRPERM(m));
 979                 printf("packet chain:\n");
 980                 for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
 981                         printf("\t0x%llx\n", (uint64_t)VM_KERNEL_ADDRPERM(m));
 982                 }
 983                 panic("sblastrecordchk from %s", where);
 984         }
 985 }
 986
 987 void
 988 sblastmbufchk(struct sockbuf *sb, const char *where)
 989 {
 990         struct mbuf *m = sb->sb_mb;
 991         struct mbuf *n;
 992
 993         while (m && m->m_nextpkt) {
 994                 m = m->m_nextpkt;
 995         }
 996
 997         while (m && m->m_next) {
 998                 m = m->m_next;
 999         }
1000
1001         if (m != sb->sb_mbtail) {
1002                 printf("sblastmbufchk: mb 0x%llx mbtail 0x%llx last 0x%llx\n",
1003                     (uint64_t)VM_KERNEL_ADDRPERM(sb->sb_mb),
1004                     (uint64_t)VM_KERNEL_ADDRPERM(sb->sb_mbtail),
1005                     (uint64_t)VM_KERNEL_ADDRPERM(m));
1006                 printf("packet tree:\n");
1007                 for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
1008                         printf("\t");
1009                         for (n = m; n != NULL; n = n->m_next) {
1010                                 printf("0x%llx ",
1011                                     (uint64_t)VM_KERNEL_ADDRPERM(n));
1012                         }
1013                         printf("\n");
1014                 }
1015                 panic("sblastmbufchk from %s", where);
1016         }
1017 }
1018
1019 /*
1020  * Similar to sbappend, except the mbuf chain begins a new record.
1021  */
1022 int
1023 sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
1024 {
1025         struct mbuf *m;
1026         int space = 0;
1027
1028         if (m0 == NULL || (sb->sb_flags & SB_DROP)) {
1029                 if (m0 != NULL) {
1030                         m_freem(m0);
1031                 }
1032                 return 0;
1033         }
1034
1035         for (m = m0; m != NULL; m = m->m_next) {
1036                 space += m->m_len;
1037         }
1038
1039         if (space > sbspace(sb) && !(sb->sb_flags & SB_UNIX)) {
1040                 m_freem(m0);
1041                 return 0;
1042         }
1043
1044         if (sb->sb_flags & SB_RECV && !(m0 && m0->m_flags & M_SKIPCFIL)) {
1045                 int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL,
1046                     sock_data_filt_flag_record);
1047
1048 #if CONTENT_FILTER
1049                 if (error == 0) {
1050                         error = cfil_sock_data_in(sb->sb_so, NULL, m0, NULL, 0);
1051                 }
1052 #endif /* CONTENT_FILTER */
1053
1054                 if (error != 0) {
1055                         SBLASTRECORDCHK(sb, "sbappendrecord 1");
1056                         if (error != EJUSTRETURN) {
1057                                 m_freem(m0);
1058                         }
1059                         return 0;
1060                 }
1061         } else if (m0) {
1062                 m0->m_flags &= ~M_SKIPCFIL;
1063         }
1064
1065         /*
1066          * Note this permits zero length records.
1067          */
1068         sballoc(sb, m0);
1069         SBLASTRECORDCHK(sb, "sbappendrecord 2");
1070         if (sb->sb_lastrecord != NULL) {
1071                 sb->sb_lastrecord->m_nextpkt = m0;
1072         } else {
1073                 sb->sb_mb = m0;
1074         }
1075         sb->sb_lastrecord = m0;
1076         sb->sb_mbtail = m0;
1077
1078         m = m0->m_next;
1079         m0->m_next = 0;
1080         if (m && (m0->m_flags & M_EOR)) {
1081                 m0->m_flags &= ~M_EOR;
1082                 m->m_flags |= M_EOR;
1083         }
1084         sbcompress(sb, m, m0);
1085         SBLASTRECORDCHK(sb, "sbappendrecord 3");
1086         return 1;
1087 }
1088
1089 /*
1090  * As above except that OOB data
1091  * is inserted at the beginning of the sockbuf,
1092  * but after any other OOB data.
1093  */
1094 int
1095 sbinsertoob(struct sockbuf *sb, struct mbuf *m0)
1096 {
1097         struct mbuf *m;
1098         struct mbuf **mp;
1099
1100         if (m0 == 0) {
1101                 return 0;
1102         }
1103
1104         SBLASTRECORDCHK(sb, "sbinsertoob 1");
1105
1106         if ((sb->sb_flags & SB_RECV && !(m0->m_flags & M_SKIPCFIL)) != 0) {
1107                 int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL,
1108                     sock_data_filt_flag_oob);
1109
1110                 SBLASTRECORDCHK(sb, "sbinsertoob 2");
1111
1112 #if CONTENT_FILTER
1113                 if (error == 0) {
1114                         error = cfil_sock_data_in(sb->sb_so, NULL, m0, NULL, 0);
1115                 }
1116 #endif /* CONTENT_FILTER */
1117
1118                 if (error) {
1119                         if (error != EJUSTRETURN) {
1120                                 m_freem(m0);
1121                         }
1122                         return 0;
1123                 }
1124         } else if (m0) {
1125                 m0->m_flags &= ~M_SKIPCFIL;
1126         }
1127
1128         for (mp = &sb->sb_mb; *mp; mp = &((*mp)->m_nextpkt)) {
1129                 m = *mp;
1130 again:
1131                 switch (m->m_type) {
1132                 case MT_OOBDATA:
1133                         continue;               /* WANT next train */
1134
1135                 case MT_CONTROL:
1136                         m = m->m_next;
1137                         if (m) {
1138                                 goto again;     /* inspect THIS train further */
1139                         }
1140                 }
1141                 break;
1142         }
1143         /*
1144          * Put the first mbuf on the queue.
1145          * Note this permits zero length records.
1146          */
1147         sballoc(sb, m0);
1148         m0->m_nextpkt = *mp;
1149         if (*mp == NULL) {
1150                 /* m0 is actually the new tail */
1151                 sb->sb_lastrecord = m0;
1152         }
1153         *mp = m0;
1154         m = m0->m_next;
1155         m0->m_next = 0;
1156         if (m && (m0->m_flags & M_EOR)) {
1157                 m0->m_flags &= ~M_EOR;
1158                 m->m_flags |= M_EOR;
1159         }
1160         sbcompress(sb, m, m0);
1161         SBLASTRECORDCHK(sb, "sbinsertoob 3");
1162         return 1;
1163 }
1164
1165 /*
1166  * Concatenate address (optional), control (optional) and data into one
1167  * single mbuf chain.  If sockbuf *sb is passed in, space check will be
1168  * performed.
1169  *
1170  * Returns:     mbuf chain pointer if succeeded, NULL if failed
1171  */
1172 struct mbuf *
1173 sbconcat_mbufs(struct sockbuf *sb, struct sockaddr *asa, struct mbuf *m0, struct mbuf *control)
1174 {
1175         struct mbuf *m = NULL, *n = NULL;
1176         int space = 0;
1177
1178         if (m0 && (m0->m_flags & M_PKTHDR) == 0) {
1179                 panic("sbconcat_mbufs");
1180         }
1181
1182         if (m0) {
1183                 space += m0->m_pkthdr.len;
1184         }
1185         for (n = control; n; n = n->m_next) {
1186                 space += n->m_len;
1187                 if (n->m_next == 0) {   /* keep pointer to last control buf */
1188                         break;
1189                 }
1190         }
1191
1192         if (asa != NULL) {
1193                 if (asa->sa_len > MLEN) {
1194                         return NULL;
1195                 }
1196                 space += asa->sa_len;
1197         }
1198
1199         if (sb != NULL && space > sbspace(sb)) {
1200                 return NULL;
1201         }
1202
1203         if (n) {
1204                 n->m_next = m0;         /* concatenate data to control */
1205         } else {
1206                 control = m0;
1207         }
1208
1209         if (asa != NULL) {
1210                 MGET(m, M_DONTWAIT, MT_SONAME);
1211                 if (m == 0) {
1212                         if (n) {
1213                                 /* unchain control and data if necessary */
1214                                 n->m_next = NULL;
1215                         }
1216                         return NULL;
1217                 }
1218                 m->m_len = asa->sa_len;
1219                 bcopy((caddr_t)asa, mtod(m, caddr_t), asa->sa_len);
1220
1221                 m->m_next = control;
1222         } else {
1223                 m = control;
1224         }
1225
1226         return m;
1227 }
1228
1229 /*
1230  * Queue mbuf chain to the receive queue of a socket.
1231  * Parameter space is the total len of the mbuf chain.
1232  * If passed in, sockbuf space will be checked.
1233  *
1234  * Returns:     0               Invalid mbuf chain
1235  *                      1               Success
1236  */
1237 int
1238 sbappendchain(struct sockbuf *sb, struct mbuf *m, int space)
1239 {
1240         struct mbuf *n, *nlast;
1241
1242         if (m == NULL) {
1243                 return 0;
1244         }
1245
1246         if (space != 0 && space > sbspace(sb)) {
1247                 return 0;
1248         }
1249
1250         for (n = m; n->m_next != NULL; n = n->m_next) {
1251                 sballoc(sb, n);
1252         }
1253         sballoc(sb, n);
1254         nlast = n;
1255
1256         if (sb->sb_lastrecord != NULL) {
1257                 sb->sb_lastrecord->m_nextpkt = m;
1258         } else {
1259                 sb->sb_mb = m;
1260         }
1261         sb->sb_lastrecord = m;
1262         sb->sb_mbtail = nlast;
1263
1264         SBLASTMBUFCHK(sb, __func__);
1265         SBLASTRECORDCHK(sb, "sbappendadddr 2");
1266
1267         postevent(0, sb, EV_RWBYTES);
1268         return 1;
1269 }
1270
1271 /*
1272  * Returns:     0                       Error: No space/out of mbufs/etc.
1273  *              1                       Success
1274  *
1275  * Imputed:     (*error_out)            errno for error
1276  *              ENOBUFS
1277  *      sflt_data_in:???                [whatever a filter author chooses]
1278  */
1279 int
1280 sbappendaddr(struct sockbuf *sb, struct sockaddr *asa, struct mbuf *m0,
1281     struct mbuf *control, int *error_out)
1282 {
1283         int result = 0;
1284         boolean_t sb_unix = (sb->sb_flags & SB_UNIX);
1285         struct mbuf *mbuf_chain = NULL;
1286
1287         if (error_out) {
1288                 *error_out = 0;
1289         }
1290
1291         if (m0 && (m0->m_flags & M_PKTHDR) == 0) {
1292                 panic("sbappendaddrorfree");
1293         }
1294
1295         if (sb->sb_flags & SB_DROP) {
1296                 if (m0 != NULL) {
1297                         m_freem(m0);
1298                 }
1299                 if (control != NULL && !sb_unix) {
1300                         m_freem(control);
1301                 }
1302                 if (error_out != NULL) {
1303                         *error_out = EINVAL;
1304                 }
1305                 return 0;
1306         }
1307
1308         /* Call socket data in filters */
1309         if (sb->sb_flags & SB_RECV && !(m0 && m0->m_flags & M_SKIPCFIL)) {
1310                 int error;
1311                 error = sflt_data_in(sb->sb_so, asa, &m0, &control, 0);
1312                 SBLASTRECORDCHK(sb, __func__);
1313
1314 #if CONTENT_FILTER
1315                 if (error == 0) {
1316                         error = cfil_sock_data_in(sb->sb_so, asa, m0, control,
1317                             0);
1318                 }
1319 #endif /* CONTENT_FILTER */
1320
1321                 if (error) {
1322                         if (error != EJUSTRETURN) {
1323                                 if (m0) {
1324                                         m_freem(m0);
1325                                 }
1326                                 if (control != NULL && !sb_unix) {
1327                                         m_freem(control);
1328                                 }
1329                                 if (error_out) {
1330                                         *error_out = error;
1331                                 }
1332                         }
1333                         return 0;
1334                 }
1335         } else if (m0) {
1336                 m0->m_flags &= ~M_SKIPCFIL;
1337         }
1338
1339         mbuf_chain = sbconcat_mbufs(sb, asa, m0, control);
1340         SBLASTRECORDCHK(sb, "sbappendadddr 1");
1341         result = sbappendchain(sb, mbuf_chain, 0);
1342         if (result == 0) {
1343                 if (m0) {
1344                         m_freem(m0);
1345                 }
1346                 if (control != NULL && !sb_unix) {
1347                         m_freem(control);
1348                 }
1349                 if (error_out) {
1350                         *error_out = ENOBUFS;
1351                 }
1352         }
1353
1354         return result;
1355 }
1356
1357 inline boolean_t
1358 is_cmsg_valid(struct mbuf *control, struct cmsghdr *cmsg)
1359 {
1360         if (cmsg == NULL) {
1361                 return FALSE;
1362         }
1363
1364         if (cmsg->cmsg_len < sizeof(struct cmsghdr)) {
1365                 return FALSE;
1366         }
1367
1368         if ((uint8_t *)control->m_data >= (uint8_t *)cmsg + cmsg->cmsg_len) {
1369                 return FALSE;
1370         }
1371
1372         if ((uint8_t *)control->m_data + control->m_len <
1373             (uint8_t *)cmsg + cmsg->cmsg_len) {
1374                 return FALSE;
1375         }
1376
1377         return TRUE;
1378 }
1379
1380 static int
1381 sbappendcontrol_internal(struct sockbuf *sb, struct mbuf *m0,
1382     struct mbuf *control)
1383 {
1384         struct mbuf *m, *mlast, *n;
1385         int space = 0;
1386
1387         if (control == 0) {
1388                 panic("sbappendcontrol");
1389         }
1390
1391         for (m = control;; m = m->m_next) {
1392                 space += m->m_len;
1393                 if (m->m_next == 0) {
1394                         break;
1395                 }
1396         }
1397         n = m;                  /* save pointer to last control buffer */
1398         for (m = m0; m; m = m->m_next) {
1399                 space += m->m_len;
1400         }
1401         if (space > sbspace(sb) && !(sb->sb_flags & SB_UNIX)) {
1402                 return 0;
1403         }
1404         n->m_next = m0;                 /* concatenate data to control */
1405         SBLASTRECORDCHK(sb, "sbappendcontrol 1");
1406
1407         for (m = control; m->m_next != NULL; m = m->m_next) {
1408                 sballoc(sb, m);
1409         }
1410         sballoc(sb, m);
1411         mlast = m;
1412
1413         if (sb->sb_lastrecord != NULL) {
1414                 sb->sb_lastrecord->m_nextpkt = control;
1415         } else {
1416                 sb->sb_mb = control;
1417         }
1418         sb->sb_lastrecord = control;
1419         sb->sb_mbtail = mlast;
1420
1421         SBLASTMBUFCHK(sb, __func__);
1422         SBLASTRECORDCHK(sb, "sbappendcontrol 2");
1423
1424         postevent(0, sb, EV_RWBYTES);
1425         return 1;
1426 }
1427
1428 int
1429 sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control,
1430     int *error_out)
1431 {
1432         int result = 0;
1433         boolean_t sb_unix = (sb->sb_flags & SB_UNIX);
1434
1435         if (error_out) {
1436                 *error_out = 0;
1437         }
1438
1439         if (sb->sb_flags & SB_DROP) {
1440                 if (m0 != NULL) {
1441                         m_freem(m0);
1442                 }
1443                 if (control != NULL && !sb_unix) {
1444                         m_freem(control);
1445                 }
1446                 if (error_out != NULL) {
1447                         *error_out = EINVAL;
1448                 }
1449                 return 0;
1450         }
1451
1452         if (sb->sb_flags & SB_RECV && !(m0 && m0->m_flags & M_SKIPCFIL)) {
1453                 int error;
1454
1455                 error = sflt_data_in(sb->sb_so, NULL, &m0, &control, 0);
1456                 SBLASTRECORDCHK(sb, __func__);
1457
1458 #if CONTENT_FILTER
1459                 if (error == 0) {
1460                         error = cfil_sock_data_in(sb->sb_so, NULL, m0, control,
1461                             0);
1462                 }
1463 #endif /* CONTENT_FILTER */
1464
1465                 if (error) {
1466                         if (error != EJUSTRETURN) {
1467                                 if (m0) {
1468                                         m_freem(m0);
1469                                 }
1470                                 if (control != NULL && !sb_unix) {
1471                                         m_freem(control);
1472                                 }
1473                                 if (error_out) {
1474                                         *error_out = error;
1475                                 }
1476                         }
1477                         return 0;
1478                 }
1479         } else if (m0) {
1480                 m0->m_flags &= ~M_SKIPCFIL;
1481         }
1482
1483         result = sbappendcontrol_internal(sb, m0, control);
1484         if (result == 0) {
1485                 if (m0) {
1486                         m_freem(m0);
1487                 }
1488                 if (control != NULL && !sb_unix) {
1489                         m_freem(control);
1490                 }
1491                 if (error_out) {
1492                         *error_out = ENOBUFS;
1493                 }
1494         }
1495
1496         return result;
1497 }
1498
1499 /*
1500  * Append a contiguous TCP data blob with TCP sequence number as control data
1501  * as a new msg to the receive socket buffer.
1502  */
1503 int
1504 sbappendmsgstream_rcv(struct sockbuf *sb, struct mbuf *m, uint32_t seqnum,
1505     int unordered)
1506 {
1507         struct mbuf *m_eor = NULL;
1508         u_int32_t data_len = 0;
1509         int ret = 0;
1510         struct socket *so = sb->sb_so;
1511
1512         if (m == NULL) {
1513                 return 0;
1514         }
1515
1516         VERIFY((m->m_flags & M_PKTHDR) && m_pktlen(m) > 0);
1517         VERIFY(so->so_msg_state != NULL);
1518         VERIFY(sb->sb_flags & SB_RECV);
1519
1520         /* Keep the TCP sequence number in the mbuf pkthdr */
1521         m->m_pkthdr.msg_seq = seqnum;
1522
1523         /* find last mbuf and set M_EOR */
1524         for (m_eor = m;; m_eor = m_eor->m_next) {
1525                 /*
1526                  * If the msg is unordered, we need to account for
1527                  * these bytes in receive socket buffer size. Otherwise,
1528                  * the receive window advertised will shrink because
1529                  * of the additional unordered bytes added to the
1530                  * receive buffer.
1531                  */
1532                 if (unordered) {
1533                         m_eor->m_flags |= M_UNORDERED_DATA;
1534                         data_len += m_eor->m_len;
1535                         so->so_msg_state->msg_uno_bytes += m_eor->m_len;
1536                 } else {
1537                         m_eor->m_flags &= ~M_UNORDERED_DATA;
1538                 }
1539                 if (m_eor->m_next == NULL) {
1540                         break;
1541                 }
1542         }
1543
1544         /* set EOR flag at end of byte blob */
1545         m_eor->m_flags |= M_EOR;
1546
1547         /* expand the receive socket buffer to allow unordered data */
1548         if (unordered && !sbreserve(sb, sb->sb_hiwat + data_len)) {
1549                 /*
1550                  * Could not allocate memory for unordered data, it
1551                  * means this packet will have to be delivered in order
1552                  */
1553                 printf("%s: could not reserve space for unordered data\n",
1554                     __func__);
1555         }
1556
1557         if (!unordered && (sb->sb_mbtail != NULL) &&
1558             !(sb->sb_mbtail->m_flags & M_UNORDERED_DATA)) {
1559                 sb->sb_mbtail->m_flags &= ~M_EOR;
1560                 sbcompress(sb, m, sb->sb_mbtail);
1561                 ret = 1;
1562         } else {
1563                 ret = sbappendrecord(sb, m);
1564         }
1565         VERIFY(sb->sb_mbtail->m_flags & M_EOR);
1566         return ret;
1567 }
1568
1569 /*
1570  * TCP streams have message based out of order delivery support, or have
1571  * Multipath TCP support, or are regular TCP sockets
1572  */
1573 int
1574 sbappendstream_rcvdemux(struct socket *so, struct mbuf *m, uint32_t seqnum,
1575     int unordered)
1576 {
1577         int ret = 0;
1578
1579         if ((m != NULL) &&
1580             m_pktlen(m) <= 0 &&
1581             !((so->so_flags & SOF_MP_SUBFLOW) &&
1582             (m->m_flags & M_PKTHDR) &&
1583             (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN))) {
1584                 m_freem(m);
1585                 return ret;
1586         }
1587
1588         if (so->so_flags & SOF_ENABLE_MSGS) {
1589                 ret = sbappendmsgstream_rcv(&so->so_rcv, m, seqnum, unordered);
1590         }
1591 #if MPTCP
1592         else if (so->so_flags & SOF_MP_SUBFLOW) {
1593                 ret = sbappendmptcpstream_rcv(&so->so_rcv, m);
1594         }
1595 #endif /* MPTCP */
1596         else {
1597                 ret = sbappendstream(&so->so_rcv, m);
1598         }
1599         return ret;
1600 }
1601
1602 #if MPTCP
1603 int
1604 sbappendmptcpstream_rcv(struct sockbuf *sb, struct mbuf *m)
1605 {
1606         struct socket *so = sb->sb_so;
1607
1608         VERIFY(m == NULL || (m->m_flags & M_PKTHDR));
1609         /* SB_NOCOMPRESS must be set prevent loss of M_PKTHDR data */
1610         VERIFY((sb->sb_flags & (SB_RECV | SB_NOCOMPRESS)) ==
1611             (SB_RECV | SB_NOCOMPRESS));
1612
1613         if (m == NULL || m_pktlen(m) == 0 || (sb->sb_flags & SB_DROP) ||
1614             (so->so_state & SS_CANTRCVMORE)) {
1615                 if (m && (m->m_flags & M_PKTHDR) &&
1616                     m_pktlen(m) == 0 &&
1617                     (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN)) {
1618                         mptcp_input(tptomptp(sototcpcb(so))->mpt_mpte, m);
1619                         return 1;
1620                 } else if (m != NULL) {
1621                         m_freem(m);
1622                 }
1623                 return 0;
1624         }
1625         /* the socket is not closed, so SOF_MP_SUBFLOW must be set */
1626         VERIFY(so->so_flags & SOF_MP_SUBFLOW);
1627
1628         if (m->m_nextpkt != NULL || (sb->sb_mb != sb->sb_lastrecord)) {
1629                 panic("%s: nexpkt %p || mb %p != lastrecord %p\n", __func__,
1630                     m->m_nextpkt, sb->sb_mb, sb->sb_lastrecord);
1631                 /* NOTREACHED */
1632         }
1633
1634         SBLASTMBUFCHK(sb, __func__);
1635
1636         /* No filter support (SB_RECV) on mptcp subflow sockets */
1637
1638         sbcompress(sb, m, sb->sb_mbtail);
1639         sb->sb_lastrecord = sb->sb_mb;
1640         SBLASTRECORDCHK(sb, __func__);
1641         return 1;
1642 }
1643 #endif /* MPTCP */
1644
1645 /*
1646  * Append message to send socket buffer based on priority.
1647  */
1648 int
1649 sbappendmsg_snd(struct sockbuf *sb, struct mbuf *m)
1650 {
1651         struct socket *so = sb->sb_so;
1652         struct msg_priq *priq;
1653         int set_eor = 0;
1654
1655         VERIFY(so->so_msg_state != NULL);
1656
1657         if (m->m_nextpkt != NULL || (sb->sb_mb != sb->sb_lastrecord)) {
1658                 panic("sbappendstream: nexpkt %p || mb %p != lastrecord %p\n",
1659                     m->m_nextpkt, sb->sb_mb, sb->sb_lastrecord);
1660         }
1661
1662         SBLASTMBUFCHK(sb, __func__);
1663
1664         if (m == NULL || (sb->sb_flags & SB_DROP) || so->so_msg_state == NULL) {
1665                 if (m != NULL) {
1666                         m_freem(m);
1667                 }
1668                 return 0;
1669         }
1670
1671         priq = &so->so_msg_state->msg_priq[m->m_pkthdr.msg_pri];
1672
1673         /* note if we need to propogate M_EOR to the last mbuf */
1674         if (m->m_flags & M_EOR) {
1675                 set_eor = 1;
1676
1677                 /* Reset M_EOR from the first mbuf */
1678                 m->m_flags &= ~(M_EOR);
1679         }
1680
1681         if (priq->msgq_head == NULL) {
1682                 VERIFY(priq->msgq_tail == NULL && priq->msgq_lastmsg == NULL);
1683                 priq->msgq_head = priq->msgq_lastmsg = m;
1684         } else {
1685                 VERIFY(priq->msgq_tail->m_next == NULL);
1686
1687                 /* Check if the last message has M_EOR flag set */
1688                 if (priq->msgq_tail->m_flags & M_EOR) {
1689                         /* Insert as a new message */
1690                         priq->msgq_lastmsg->m_nextpkt = m;
1691
1692                         /* move the lastmsg pointer */
1693                         priq->msgq_lastmsg = m;
1694                 } else {
1695                         /* Append to the existing message */
1696                         priq->msgq_tail->m_next = m;
1697                 }
1698         }
1699
1700         /* Update accounting and the queue tail pointer */
1701
1702         while (m->m_next != NULL) {
1703                 sballoc(sb, m);
1704                 priq->msgq_bytes += m->m_len;
1705                 m = m->m_next;
1706         }
1707         sballoc(sb, m);
1708         priq->msgq_bytes += m->m_len;
1709
1710         if (set_eor) {
1711                 m->m_flags |= M_EOR;
1712
1713                 /*
1714                  * Since the user space can not write a new msg
1715                  * without completing the previous one, we can
1716                  * reset this flag to start sending again.
1717                  */
1718                 priq->msgq_flags &= ~(MSGQ_MSG_NOTDONE);
1719         }
1720
1721         priq->msgq_tail = m;
1722
1723         SBLASTRECORDCHK(sb, "sbappendstream 2");
1724         postevent(0, sb, EV_RWBYTES);
1725         return 1;
1726 }
1727
1728 /*
1729  * Pull data from priority queues to the serial snd queue
1730  * right before sending.
1731  */
1732 void
1733 sbpull_unordered_data(struct socket *so, int32_t off, int32_t len)
1734 {
1735         int32_t topull, i;
1736         struct msg_priq *priq = NULL;
1737
1738         VERIFY(so->so_msg_state != NULL);
1739
1740         topull = (off + len) - so->so_msg_state->msg_serial_bytes;
1741
1742         i = MSG_PRI_MAX;
1743         while (i >= MSG_PRI_MIN && topull > 0) {
1744                 struct mbuf *m = NULL, *mqhead = NULL, *mend = NULL;
1745                 priq = &so->so_msg_state->msg_priq[i];
1746                 if ((priq->msgq_flags & MSGQ_MSG_NOTDONE) &&
1747                     priq->msgq_head == NULL) {
1748                         /*
1749                          * We were in the middle of sending
1750                          * a message and we have not seen the
1751                          * end of it.
1752                          */
1753                         VERIFY(priq->msgq_lastmsg == NULL &&
1754                             priq->msgq_tail == NULL);
1755                         return;
1756                 }
1757                 if (priq->msgq_head != NULL) {
1758                         int32_t bytes = 0, topull_tmp = topull;
1759                         /*
1760                          * We found a msg while scanning the priority
1761                          * queue from high to low priority.
1762                          */
1763                         m = priq->msgq_head;
1764                         mqhead = m;
1765                         mend = m;
1766
1767                         /*
1768                          * Move bytes from the priority queue to the
1769                          * serial queue. Compute the number of bytes
1770                          * being added.
1771                          */
1772                         while (mqhead->m_next != NULL && topull_tmp > 0) {
1773                                 bytes += mqhead->m_len;
1774                                 topull_tmp -= mqhead->m_len;
1775                                 mend = mqhead;
1776                                 mqhead = mqhead->m_next;
1777                         }
1778
1779                         if (mqhead->m_next == NULL) {
1780                                 /*
1781                                  * If we have only one more mbuf left,
1782                                  * move the last mbuf of this message to
1783                                  * serial queue and set the head of the
1784                                  * queue to be the next message.
1785                                  */
1786                                 bytes += mqhead->m_len;
1787                                 mend = mqhead;
1788                                 mqhead = m->m_nextpkt;
1789                                 if (!(mend->m_flags & M_EOR)) {
1790                                         /*
1791                                          * We have not seen the end of
1792                                          * this message, so we can not
1793                                          * pull anymore.
1794                                          */
1795                                         priq->msgq_flags |= MSGQ_MSG_NOTDONE;
1796                                 } else {
1797                                         /* Reset M_EOR */
1798                                         mend->m_flags &= ~(M_EOR);
1799                                 }
1800                         } else {
1801                                 /* propogate the next msg pointer */
1802                                 mqhead->m_nextpkt = m->m_nextpkt;
1803                         }
1804                         priq->msgq_head = mqhead;
1805
1806                         /*
1807                          * if the lastmsg pointer points to
1808                          * the mbuf that is being dequeued, update
1809                          * it to point to the new head.
1810                          */
1811                         if (priq->msgq_lastmsg == m) {
1812                                 priq->msgq_lastmsg = priq->msgq_head;
1813                         }
1814
1815                         m->m_nextpkt = NULL;
1816                         mend->m_next = NULL;
1817
1818                         if (priq->msgq_head == NULL) {
1819                                 /* Moved all messages, update tail */
1820                                 priq->msgq_tail = NULL;
1821                                 VERIFY(priq->msgq_lastmsg == NULL);
1822                         }
1823
1824                         /* Move it to serial sb_mb queue */
1825                         if (so->so_snd.sb_mb == NULL) {
1826                                 so->so_snd.sb_mb = m;
1827                         } else {
1828                                 so->so_snd.sb_mbtail->m_next = m;
1829                         }
1830
1831                         priq->msgq_bytes -= bytes;
1832                         VERIFY(priq->msgq_bytes >= 0);
1833                         sbwakeup(&so->so_snd);
1834
1835                         so->so_msg_state->msg_serial_bytes += bytes;
1836                         so->so_snd.sb_mbtail = mend;
1837                         so->so_snd.sb_lastrecord = so->so_snd.sb_mb;
1838
1839                         topull =
1840                             (off + len) - so->so_msg_state->msg_serial_bytes;
1841
1842                         if (priq->msgq_flags & MSGQ_MSG_NOTDONE) {
1843                                 break;
1844                         }
1845                 } else {
1846                         --i;
1847                 }
1848         }
1849         sblastrecordchk(&so->so_snd, "sbpull_unordered_data");
1850         sblastmbufchk(&so->so_snd, "sbpull_unordered_data");
1851 }
1852
1853 /*
1854  * Compress mbuf chain m into the socket
1855  * buffer sb following mbuf n.  If n
1856  * is null, the buffer is presumed empty.
1857  */
1858 static inline void
1859 sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
1860 {
1861         int eor = 0, compress = (!(sb->sb_flags & SB_NOCOMPRESS));
1862         struct mbuf *o;
1863
1864         if (m == NULL) {
1865                 /* There is nothing to compress; just update the tail */
1866                 for (; n->m_next != NULL; n = n->m_next) {
1867                         ;
1868                 }
1869                 sb->sb_mbtail = n;
1870                 goto done;
1871         }
1872
1873         while (m != NULL) {
1874                 eor |= m->m_flags & M_EOR;
1875                 if (compress && m->m_len == 0 && (eor == 0 ||
1876                     (((o = m->m_next) || (o = n)) && o->m_type == m->m_type))) {
1877                         if (sb->sb_lastrecord == m) {
1878                                 sb->sb_lastrecord = m->m_next;
1879                         }
1880                         m = m_free(m);
1881                         continue;
1882                 }
1883                 if (compress && n != NULL && (n->m_flags & M_EOR) == 0 &&
1884 #ifndef __APPLE__
1885                     M_WRITABLE(n) &&
1886 #endif
1887                     m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
1888                     m->m_len <= M_TRAILINGSPACE(n) &&
1889                     n->m_type == m->m_type) {
1890                         bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
1891                             (unsigned)m->m_len);
1892                         n->m_len += m->m_len;
1893                         sb->sb_cc += m->m_len;
1894                         if (m->m_type != MT_DATA && m->m_type != MT_HEADER &&
1895                             m->m_type != MT_OOBDATA) {
1896                                 /* XXX: Probably don't need */
1897                                 sb->sb_ctl += m->m_len;
1898                         }
1899
1900                         /* update send byte count */
1901                         if (sb->sb_flags & SB_SNDBYTE_CNT) {
1902                                 inp_incr_sndbytes_total(sb->sb_so,
1903                                     m->m_len);
1904                                 inp_incr_sndbytes_unsent(sb->sb_so,
1905                                     m->m_len);
1906                         }
1907                         m = m_free(m);
1908                         continue;
1909                 }
1910                 if (n != NULL) {
1911                         n->m_next = m;
1912                 } else {
1913                         sb->sb_mb = m;
1914                 }
1915                 sb->sb_mbtail = m;
1916                 sballoc(sb, m);
1917                 n = m;
1918                 m->m_flags &= ~M_EOR;
1919                 m = m->m_next;
1920                 n->m_next = NULL;
1921         }
1922         if (eor != 0) {
1923                 if (n != NULL) {
1924                         n->m_flags |= eor;
1925                 } else {
1926                         printf("semi-panic: sbcompress\n");
1927                 }
1928         }
1929 done:
1930         SBLASTMBUFCHK(sb, __func__);
1931         postevent(0, sb, EV_RWBYTES);
1932 }
1933
1934 void
1935 sb_empty_assert(struct sockbuf *sb, const char *where)
1936 {
1937         if (!(sb->sb_cc == 0 && sb->sb_mb == NULL && sb->sb_mbcnt == 0 &&
1938             sb->sb_mbtail == NULL && sb->sb_lastrecord == NULL)) {
1939                 panic("%s: sb %p so %p cc %d mbcnt %d mb %p mbtail %p "
1940                     "lastrecord %p\n", where, sb, sb->sb_so, sb->sb_cc,
1941                     sb->sb_mbcnt, sb->sb_mb, sb->sb_mbtail,
1942                     sb->sb_lastrecord);
1943                 /* NOTREACHED */
1944         }
1945 }
1946
1947 static void
1948 sbflush_priq(struct msg_priq *priq)
1949 {
1950         struct mbuf *m;
1951         m = priq->msgq_head;
1952         if (m != NULL) {
1953                 m_freem_list(m);
1954         }
1955         priq->msgq_head = priq->msgq_tail = priq->msgq_lastmsg = NULL;
1956         priq->msgq_bytes = priq->msgq_flags = 0;
1957 }
1958
1959 /*
1960  * Free all mbufs in a sockbuf.
1961  * Check that all resources are reclaimed.
1962  */
1963 void
1964 sbflush(struct sockbuf *sb)
1965 {
1966         void *lr_saved = __builtin_return_address(0);
1967         struct socket *so = sb->sb_so;
1968         u_int32_t i;
1969
1970         /* so_usecount may be 0 if we get here from sofreelastref() */
1971         if (so == NULL) {
1972                 panic("%s: null so, sb=%p sb_flags=0x%x lr=%p\n",
1973                     __func__, sb, sb->sb_flags, lr_saved);
1974                 /* NOTREACHED */
1975         } else if (so->so_usecount < 0) {
1976                 panic("%s: sb=%p sb_flags=0x%x sb_so=%p usecount=%d lr=%p "
1977                     "lrh= %s\n", __func__, sb, sb->sb_flags, so,
1978                     so->so_usecount, lr_saved, solockhistory_nr(so));
1979                 /* NOTREACHED */
1980         }
1981
1982         /*
1983          * Obtain lock on the socket buffer (SB_LOCK).  This is required
1984          * to prevent the socket buffer from being unexpectedly altered
1985          * while it is used by another thread in socket send/receive.
1986          *
1987          * sblock() must not fail here, hence the assertion.
1988          */
1989         (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
1990         VERIFY(sb->sb_flags & SB_LOCK);
1991
1992         while (sb->sb_mbcnt > 0) {
1993                 /*
1994                  * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
1995                  * we would loop forever. Panic instead.
1996                  */
1997                 if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len)) {
1998                         break;
1999                 }
2000                 sbdrop(sb, (int)sb->sb_cc);
2001         }
2002
2003         if (!(sb->sb_flags & SB_RECV) && (so->so_flags & SOF_ENABLE_MSGS)) {
2004                 VERIFY(so->so_msg_state != NULL);
2005                 for (i = MSG_PRI_MIN; i <= MSG_PRI_MAX; ++i) {
2006                         sbflush_priq(&so->so_msg_state->msg_priq[i]);
2007                 }
2008                 so->so_msg_state->msg_serial_bytes = 0;
2009                 so->so_msg_state->msg_uno_bytes = 0;
2010         }
2011
2012         sb_empty_assert(sb, __func__);
2013         postevent(0, sb, EV_RWBYTES);
2014
2015         sbunlock(sb, TRUE);     /* keep socket locked */
2016 }
2017
2018 /*
2019  * Drop data from (the front of) a sockbuf.
2020  * use m_freem_list to free the mbuf structures
2021  * under a single lock... this is done by pruning
2022  * the top of the tree from the body by keeping track
2023  * of where we get to in the tree and then zeroing the
2024  * two pertinent pointers m_nextpkt and m_next
2025  * the socket buffer is then updated to point at the new
2026  * top of the tree and the pruned area is released via
2027  * m_freem_list.
2028  */
2029 void
2030 sbdrop(struct sockbuf *sb, int len)
2031 {
2032         struct mbuf *m, *free_list, *ml;
2033         struct mbuf *next, *last;
2034
2035         next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
2036 #if MPTCP
2037         if (m != NULL && len > 0 && !(sb->sb_flags & SB_RECV) &&
2038             ((sb->sb_so->so_flags & SOF_MP_SUBFLOW) ||
2039             (SOCK_CHECK_DOM(sb->sb_so, PF_MULTIPATH) &&
2040             SOCK_CHECK_PROTO(sb->sb_so, IPPROTO_TCP))) &&
2041             !(sb->sb_so->so_flags1 & SOF1_POST_FALLBACK_SYNC)) {
2042                 mptcp_preproc_sbdrop(sb->sb_so, m, (unsigned int)len);
2043         }
2044         if (m != NULL && len > 0 && !(sb->sb_flags & SB_RECV) &&
2045             (sb->sb_so->so_flags & SOF_MP_SUBFLOW) &&
2046             (sb->sb_so->so_flags1 & SOF1_POST_FALLBACK_SYNC)) {
2047                 mptcp_fallback_sbdrop(sb->sb_so, m, len);
2048         }
2049 #endif /* MPTCP */
2050         KERNEL_DEBUG((DBG_FNC_SBDROP | DBG_FUNC_START), sb, len, 0, 0, 0);
2051
2052         free_list = last = m;
2053         ml = (struct mbuf *)0;
2054
2055         while (len > 0) {
2056                 if (m == NULL) {
2057                         if (next == NULL) {
2058                                 /*
2059                                  * temporarily replacing this panic with printf
2060                                  * because it occurs occasionally when closing
2061                                  * a socket when there is no harm in ignoring
2062                                  * it. This problem will be investigated
2063                                  * further.
2064                                  */
2065                                 /* panic("sbdrop"); */
2066                                 printf("sbdrop - count not zero\n");
2067                                 len = 0;
2068                                 /*
2069                                  * zero the counts. if we have no mbufs,
2070                                  * we have no data (PR-2986815)
2071                                  */
2072                                 sb->sb_cc = 0;
2073                                 sb->sb_mbcnt = 0;
2074                                 if (!(sb->sb_flags & SB_RECV) &&
2075                                     (sb->sb_so->so_flags & SOF_ENABLE_MSGS)) {
2076                                         sb->sb_so->so_msg_state->
2077                                         msg_serial_bytes = 0;
2078                                 }
2079                                 break;
2080                         }
2081                         m = last = next;
2082                         next = m->m_nextpkt;
2083                         continue;
2084                 }
2085                 if (m->m_len > len) {
2086                         m->m_len -= len;
2087                         m->m_data += len;
2088                         sb->sb_cc -= len;
2089                         /* update the send byte count */
2090                         if (sb->sb_flags & SB_SNDBYTE_CNT) {
2091                                 inp_decr_sndbytes_total(sb->sb_so, len);
2092                         }
2093                         if (m->m_type != MT_DATA && m->m_type != MT_HEADER &&
2094                             m->m_type != MT_OOBDATA) {
2095                                 sb->sb_ctl -= len;
2096                         }
2097                         break;
2098                 }
2099                 len -= m->m_len;
2100                 sbfree(sb, m);
2101
2102                 ml = m;
2103                 m = m->m_next;
2104         }
2105         while (m && m->m_len == 0) {
2106                 sbfree(sb, m);
2107
2108                 ml = m;
2109                 m = m->m_next;
2110         }
2111         if (ml) {
2112                 ml->m_next = (struct mbuf *)0;
2113                 last->m_nextpkt = (struct mbuf *)0;
2114                 m_freem_list(free_list);
2115         }
2116         if (m) {
2117                 sb->sb_mb = m;
2118                 m->m_nextpkt = next;
2119         } else {
2120                 sb->sb_mb = next;
2121         }
2122
2123         /*
2124          * First part is an inline SB_EMPTY_FIXUP().  Second part
2125          * makes sure sb_lastrecord is up-to-date if we dropped
2126          * part of the last record.
2127          */
2128         m = sb->sb_mb;
2129         if (m == NULL) {
2130                 sb->sb_mbtail = NULL;
2131                 sb->sb_lastrecord = NULL;
2132         } else if (m->m_nextpkt == NULL) {
2133                 sb->sb_lastrecord = m;
2134         }
2135
2136 #if CONTENT_FILTER
2137         cfil_sock_buf_update(sb);
2138 #endif /* CONTENT_FILTER */
2139
2140         postevent(0, sb, EV_RWBYTES);
2141
2142         KERNEL_DEBUG((DBG_FNC_SBDROP | DBG_FUNC_END), sb, 0, 0, 0, 0);
2143 }
2144
2145 /*
2146  * Drop a record off the front of a sockbuf
2147  * and move the next record to the front.
2148  */
2149 void
2150 sbdroprecord(struct sockbuf *sb)
2151 {
2152         struct mbuf *m, *mn;
2153
2154         m = sb->sb_mb;
2155         if (m) {
2156                 sb->sb_mb = m->m_nextpkt;
2157                 do {
2158                         sbfree(sb, m);
2159                         MFREE(m, mn);
2160                         m = mn;
2161                 } while (m);
2162         }
2163         SB_EMPTY_FIXUP(sb);
2164         postevent(0, sb, EV_RWBYTES);
2165 }
2166
2167 /*
2168  * Create a "control" mbuf containing the specified data
2169  * with the specified type for presentation on a socket buffer.
2170  */
2171 struct mbuf *
2172 sbcreatecontrol(caddr_t p, int size, int type, int level)
2173 {
2174         struct cmsghdr *cp;
2175         struct mbuf *m;
2176
2177         if (CMSG_SPACE((u_int)size) > MLEN) {
2178                 return (struct mbuf *)NULL;
2179         }
2180         if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL) {
2181                 return (struct mbuf *)NULL;
2182         }
2183         cp = mtod(m, struct cmsghdr *);
2184         VERIFY(IS_P2ALIGNED(cp, sizeof(u_int32_t)));
2185         /* XXX check size? */
2186         (void) memcpy(CMSG_DATA(cp), p, size);
2187         m->m_len = CMSG_SPACE(size);
2188         cp->cmsg_len = CMSG_LEN(size);
2189         cp->cmsg_level = level;
2190         cp->cmsg_type = type;
2191         return m;
2192 }
2193
2194 struct mbuf **
2195 sbcreatecontrol_mbuf(caddr_t p, int size, int type, int level, struct mbuf **mp)
2196 {
2197         struct mbuf *m;
2198         struct cmsghdr *cp;
2199
2200         if (*mp == NULL) {
2201                 *mp = sbcreatecontrol(p, size, type, level);
2202                 return mp;
2203         }
2204
2205         if (CMSG_SPACE((u_int)size) + (*mp)->m_len > MLEN) {
2206                 mp = &(*mp)->m_next;
2207                 *mp = sbcreatecontrol(p, size, type, level);
2208                 return mp;
2209         }
2210
2211         m = *mp;
2212
2213         cp = (struct cmsghdr *)(void *)(mtod(m, char *) + m->m_len);
2214         /* CMSG_SPACE ensures 32-bit alignment */
2215         VERIFY(IS_P2ALIGNED(cp, sizeof(u_int32_t)));
2216         m->m_len += CMSG_SPACE(size);
2217
2218         /* XXX check size? */
2219         (void) memcpy(CMSG_DATA(cp), p, size);
2220         cp->cmsg_len = CMSG_LEN(size);
2221         cp->cmsg_level = level;
2222         cp->cmsg_type = type;
2223
2224         return mp;
2225 }
2226
2227
2228 /*
2229  * Some routines that return EOPNOTSUPP for entry points that are not
2230  * supported by a protocol.  Fill in as needed.
2231  */
/* Stub for pru_abort: ignores its argument and returns EOPNOTSUPP. */
int
pru_abort_notsupp(struct socket *so)
{
        (void)so;

        return EOPNOTSUPP;
}
2238
/* Stub for pru_accept: ignores its arguments and returns EOPNOTSUPP. */
int
pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
{
        (void)so;
        (void)nam;

        return EOPNOTSUPP;
}
2245
/* Stub for pru_attach: ignores its arguments and returns EOPNOTSUPP. */
int
pru_attach_notsupp(struct socket *so, int proto, struct proc *p)
{
        (void)so;
        (void)proto;
        (void)p;

        return EOPNOTSUPP;
}
2252
/* Stub for pru_bind: ignores its arguments and returns EOPNOTSUPP. */
int
pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct proc *p)
{
        (void)so;
        (void)nam;
        (void)p;

        return EOPNOTSUPP;
}
2259
/* Stub for pru_connect: ignores its arguments and returns EOPNOTSUPP. */
int
pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct proc *p)
{
        (void)so;
        (void)nam;
        (void)p;

        return EOPNOTSUPP;
}
2266
/* Stub for pru_connect2: ignores both sockets and returns EOPNOTSUPP. */
int
pru_connect2_notsupp(struct socket *so1, struct socket *so2)
{
        (void)so1;
        (void)so2;

        return EOPNOTSUPP;
}
2273
2274 int
2275 pru_connectx_notsupp(struct socket *so, struct sockaddr *src,
2276     struct sockaddr *dst, struct proc *p, uint32_t ifscope,
2277     sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
2278     uint32_t arglen, struct uio *uio, user_ssize_t *bytes_written)
2279 {
2280 #pragma unused(so, src, dst, p, ifscope, aid, pcid, flags, arg, arglen, uio, bytes_written)
2281         return EOPNOTSUPP;
2282 }
2283
2284 int
2285 pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
2286     struct ifnet *ifp, struct proc *p)
2287 {
2288 #pragma unused(so, cmd, data, ifp, p)
2289         return EOPNOTSUPP;
2290 }
2291
/* Stub for pru_detach: ignores its argument and returns EOPNOTSUPP. */
int
pru_detach_notsupp(struct socket *so)
{
        (void)so;

        return EOPNOTSUPP;
}
2298
/* Stub for pru_disconnect: ignores its argument and returns EOPNOTSUPP. */
int
pru_disconnect_notsupp(struct socket *so)
{
        (void)so;

        return EOPNOTSUPP;
}
2305
2306 int
2307 pru_disconnectx_notsupp(struct socket *so, sae_associd_t aid, sae_connid_t cid)
2308 {
2309 #pragma unused(so, aid, cid)
2310         return EOPNOTSUPP;
2311 }
2312
/* Stub for pru_listen: ignores its arguments and returns EOPNOTSUPP. */
int
pru_listen_notsupp(struct socket *so, struct proc *p)
{
        (void)so;
        (void)p;

        return EOPNOTSUPP;
}
2319
/* Stub for pru_peeraddr: ignores its arguments and returns EOPNOTSUPP. */
int
pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
{
        (void)so;
        (void)nam;

        return EOPNOTSUPP;
}
2326
/* Stub for pru_rcvd: ignores its arguments and returns EOPNOTSUPP. */
int
pru_rcvd_notsupp(struct socket *so, int flags)
{
        (void)so;
        (void)flags;

        return EOPNOTSUPP;
}
2333
/* Stub for pru_rcvoob: ignores its arguments and returns EOPNOTSUPP. */
int
pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
{
        (void)so;
        (void)m;
        (void)flags;

        return EOPNOTSUPP;
}
2340
/* Stub for pru_send: ignores its arguments and returns EOPNOTSUPP. */
int
pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
    struct sockaddr *addr, struct mbuf *control, struct proc *p)
{
        (void)so;
        (void)flags;
        (void)m;
        (void)addr;
        (void)control;
        (void)p;

        return EOPNOTSUPP;
}
2348
/* Stub for pru_send_list: ignores its arguments and returns EOPNOTSUPP. */
int
pru_send_list_notsupp(struct socket *so, int flags, struct mbuf *m,
    struct sockaddr *addr, struct mbuf *control, struct proc *p)
{
        (void)so;
        (void)flags;
        (void)m;
        (void)addr;
        (void)control;
        (void)p;

        return EOPNOTSUPP;
}
2356
2357 /*
2358  * This isn't really a ``null'' operation, but it's the default one
2359  * and doesn't do anything destructive.
2360  */
2361 int
2362 pru_sense_null(struct socket *so, void *ub, int isstat64)
2363 {
2364         if (isstat64 != 0) {
2365                 struct stat64 *sb64;
2366
2367                 sb64 = (struct stat64 *)ub;
2368                 sb64->st_blksize = so->so_snd.sb_hiwat;
2369         } else {
2370                 struct stat *sb;
2371
2372                 sb = (struct stat *)ub;
2373                 sb->st_blksize = so->so_snd.sb_hiwat;
2374         }
2375
2376         return 0;
2377 }
2378
2379
/* Stub for pru_sosend: ignores its arguments and returns EOPNOTSUPP. */
int
pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags)
{
        (void)so;
        (void)addr;
        (void)uio;
        (void)top;
        (void)control;
        (void)flags;

        return EOPNOTSUPP;
}
2387
2388 int
2389 pru_sosend_list_notsupp(struct socket *so, struct uio **uio,
2390     u_int uiocnt, int flags)
2391 {
2392 #pragma unused(so, uio, uiocnt, flags)
2393         return EOPNOTSUPP;
2394 }
2395
/* Stub for pru_soreceive: ignores its arguments and returns EOPNOTSUPP. */
int
pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
        (void)so;
        (void)paddr;
        (void)uio;
        (void)mp0;
        (void)controlp;
        (void)flagsp;

        return EOPNOTSUPP;
}
2403
2404 int
2405 pru_soreceive_list_notsupp(struct socket *so,
2406     struct recv_msg_elem *recv_msg_array, u_int uiocnt, int *flagsp)
2407 {
2408 #pragma unused(so, recv_msg_array, uiocnt, flagsp)
2409         return EOPNOTSUPP;
2410 }
2411
/* Stub for pru_shutdown: ignores its argument and returns EOPNOTSUPP. */
int
pru_shutdown_notsupp(struct socket *so)
{
        (void)so;

        return EOPNOTSUPP;
}
2418
/* Stub for pru_sockaddr: ignores its arguments and returns EOPNOTSUPP. */
int
pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
{
        (void)so;
        (void)nam;

        return EOPNOTSUPP;
}
2425
2426 int
2427 pru_sopoll_notsupp(struct socket *so, int events, kauth_cred_t cred, void *wql)
2428 {
2429 #pragma unused(so, events, cred, wql)
2430         return EOPNOTSUPP;
2431 }
2432
/*
 * Default pru_socheckopt handler: every option is allowed for set/get,
 * so the arguments are ignored and 0 is returned.
 */
int
pru_socheckopt_null(struct socket *so, struct sockopt *sopt)
{
        (void)so;
        (void)sopt;

        return 0;
}
2442
/* Default pru_preconnect handler: ignores the socket and returns 0. */
static int
pru_preconnect_null(struct socket *so)
{
        (void)so;

        return 0;
}
2449
2450 void
2451 pru_sanitize(struct pr_usrreqs *pru)
2452 {
2453 #define DEFAULT(foo, bar)       if ((foo) == NULL) (foo) = (bar)
2454         DEFAULT(pru->pru_abort, pru_abort_notsupp);
2455         DEFAULT(pru->pru_accept, pru_accept_notsupp);
2456         DEFAULT(pru->pru_attach, pru_attach_notsupp);
2457         DEFAULT(pru->pru_bind, pru_bind_notsupp);
2458         DEFAULT(pru->pru_connect, pru_connect_notsupp);
2459         DEFAULT(pru->pru_connect2, pru_connect2_notsupp);
2460         DEFAULT(pru->pru_connectx, pru_connectx_notsupp);
2461         DEFAULT(pru->pru_control, pru_control_notsupp);
2462         DEFAULT(pru->pru_detach, pru_detach_notsupp);
2463         DEFAULT(pru->pru_disconnect, pru_disconnect_notsupp);
2464         DEFAULT(pru->pru_disconnectx, pru_disconnectx_notsupp);
2465         DEFAULT(pru->pru_listen, pru_listen_notsupp);
2466         DEFAULT(pru->pru_peeraddr, pru_peeraddr_notsupp);
2467         DEFAULT(pru->pru_rcvd, pru_rcvd_notsupp);
2468         DEFAULT(pru->pru_rcvoob, pru_rcvoob_notsupp);
2469         DEFAULT(pru->pru_send, pru_send_notsupp);
2470         DEFAULT(pru->pru_send_list, pru_send_list_notsupp);
2471         DEFAULT(pru->pru_sense, pru_sense_null);
2472         DEFAULT(pru->pru_shutdown, pru_shutdown_notsupp);
2473         DEFAULT(pru->pru_sockaddr, pru_sockaddr_notsupp);
2474         DEFAULT(pru->pru_sopoll, pru_sopoll_notsupp);
2475         DEFAULT(pru->pru_soreceive, pru_soreceive_notsupp);
2476         DEFAULT(pru->pru_soreceive_list, pru_soreceive_list_notsupp);
2477         DEFAULT(pru->pru_sosend, pru_sosend_notsupp);
2478         DEFAULT(pru->pru_sosend_list, pru_sosend_list_notsupp);
2479         DEFAULT(pru->pru_socheckopt, pru_socheckopt_null);
2480         DEFAULT(pru->pru_preconnect, pru_preconnect_null);
2481 #undef DEFAULT
2482 }
2483
2484 /*
2485  * The following are macros on BSD and functions on Darwin
2486  */
2487
2488 /*
2489  * Do we need to notify the other side when I/O is possible?
2490  */
2491
2492 int
2493 sb_notify(struct sockbuf *sb)
2494 {
2495         return sb->sb_waiters > 0 ||
2496                (sb->sb_flags & (SB_SEL | SB_ASYNC | SB_UPCALL | SB_KNOTE));
2497 }
2498
2499 /*
2500  * How much space is there in a socket buffer (so->so_snd or so->so_rcv)?
2501  * This is problematical if the fields are unsigned, as the space might
2502  * still be negative (cc > hiwat or mbcnt > mbmax).  Should detect
2503  * overflow and return 0.
2504  */
2505 int
2506 sbspace(struct sockbuf *sb)
2507 {
2508         int pending = 0;
2509         int space = imin((int)(sb->sb_hiwat - sb->sb_cc),
2510             (int)(sb->sb_mbmax - sb->sb_mbcnt));
2511
2512         if (sb->sb_preconn_hiwat != 0) {
2513                 space = imin((int)(sb->sb_preconn_hiwat - sb->sb_cc), space);
2514         }
2515
2516         if (space < 0) {
2517                 space = 0;
2518         }
2519
2520         /* Compensate for data being processed by content filters */
2521 #if CONTENT_FILTER
2522         pending = cfil_sock_data_space(sb);
2523 #endif /* CONTENT_FILTER */
2524         if (pending > space) {
2525                 space = 0;
2526         } else {
2527                 space -= pending;
2528         }
2529
2530         return space;
2531 }
2532
2533 /*
2534  * If this socket has priority queues, check if there is enough
2535  * space in the priority queue for this msg.
2536  */
2537 int
2538 msgq_sbspace(struct socket *so, struct mbuf *control)
2539 {
2540         int space = 0, error;
2541         u_int32_t msgpri = 0;
2542         VERIFY(so->so_type == SOCK_STREAM &&
2543             SOCK_PROTO(so) == IPPROTO_TCP);
2544         if (control != NULL) {
2545                 error = tcp_get_msg_priority(control, &msgpri);
2546                 if (error) {
2547                         return 0;
2548                 }
2549         } else {
2550                 msgpri = MSG_PRI_0;
2551         }
2552         space = (so->so_snd.sb_idealsize / MSG_PRI_COUNT) -
2553             so->so_msg_state->msg_priq[msgpri].msgq_bytes;
2554         if (space < 0) {
2555                 space = 0;
2556         }
2557         return space;
2558 }
2559
2560 /* do we have to send all at once on a socket? */
2561 int
2562 sosendallatonce(struct socket *so)
2563 {
2564         return so->so_proto->pr_flags & PR_ATOMIC;
2565 }
2566
2567 /* can we read something from so? */
2568 int
2569 soreadable(struct socket *so)
2570 {
2571         return so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
2572                ((so->so_state & SS_CANTRCVMORE)
2573 #if CONTENT_FILTER
2574                && cfil_sock_data_pending(&so->so_rcv) == 0
2575 #endif /* CONTENT_FILTER */
2576                ) ||
2577                so->so_comp.tqh_first || so->so_error;
2578 }
2579
2580 /* can we write something to so? */
2581
2582 int
2583 sowriteable(struct socket *so)
2584 {
2585         if ((so->so_state & SS_CANTSENDMORE) ||
2586             so->so_error > 0) {
2587                 return 1;
2588         }
2589         if (so_wait_for_if_feedback(so) || !socanwrite(so)) {
2590                 return 0;
2591         }
2592         if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2593                 return 1;
2594         }
2595
2596         if (sbspace(&(so)->so_snd) >= (so)->so_snd.sb_lowat) {
2597                 if (so->so_flags & SOF_NOTSENT_LOWAT) {
2598                         if ((SOCK_DOM(so) == PF_INET6 ||
2599                             SOCK_DOM(so) == PF_INET) &&
2600                             so->so_type == SOCK_STREAM) {
2601                                 return tcp_notsent_lowat_check(so);
2602                         }
2603 #if MPTCP
2604                         else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
2605                             (SOCK_PROTO(so) == IPPROTO_TCP)) {
2606                                 return mptcp_notsent_lowat_check(so);
2607                         }
2608 #endif
2609                         else {
2610                                 return 1;
2611                         }
2612                 } else {
2613                         return 1;
2614                 }
2615         }
2616         return 0;
2617 }
2618
2619 /* adjust counters in sb reflecting allocation of m */
2620
2621 void
2622 sballoc(struct sockbuf *sb, struct mbuf *m)
2623 {
2624         u_int32_t cnt = 1;
2625         sb->sb_cc += m->m_len;
2626         if (m->m_type != MT_DATA && m->m_type != MT_HEADER &&
2627             m->m_type != MT_OOBDATA) {
2628                 sb->sb_ctl += m->m_len;
2629         }
2630         sb->sb_mbcnt += MSIZE;
2631
2632         if (m->m_flags & M_EXT) {
2633                 sb->sb_mbcnt += m->m_ext.ext_size;
2634                 cnt += (m->m_ext.ext_size >> MSIZESHIFT);
2635         }
2636         OSAddAtomic(cnt, &total_sbmb_cnt);
2637         VERIFY(total_sbmb_cnt > 0);
2638         if (total_sbmb_cnt > total_sbmb_cnt_peak) {
2639                 total_sbmb_cnt_peak = total_sbmb_cnt;
2640         }
2641
2642         /*
2643          * If data is being added to the send socket buffer,
2644          * update the send byte count
2645          */
2646         if (sb->sb_flags & SB_SNDBYTE_CNT) {
2647                 inp_incr_sndbytes_total(sb->sb_so, m->m_len);
2648                 inp_incr_sndbytes_unsent(sb->sb_so, m->m_len);
2649         }
2650 }
2651
2652 /* adjust counters in sb reflecting freeing of m */
2653 void
2654 sbfree(struct sockbuf *sb, struct mbuf *m)
2655 {
2656         int cnt = -1;
2657
2658         sb->sb_cc -= m->m_len;
2659         if (m->m_type != MT_DATA && m->m_type != MT_HEADER &&
2660             m->m_type != MT_OOBDATA) {
2661                 sb->sb_ctl -= m->m_len;
2662         }
2663         sb->sb_mbcnt -= MSIZE;
2664         if (m->m_flags & M_EXT) {
2665                 sb->sb_mbcnt -= m->m_ext.ext_size;
2666                 cnt -= (m->m_ext.ext_size >> MSIZESHIFT);
2667         }
2668         OSAddAtomic(cnt, &total_sbmb_cnt);
2669         VERIFY(total_sbmb_cnt >= 0);
2670         if (total_sbmb_cnt < total_sbmb_cnt_floor) {
2671                 total_sbmb_cnt_floor = total_sbmb_cnt;
2672         }
2673
2674         /*
2675          * If data is being removed from the send socket buffer,
2676          * update the send byte count
2677          */
2678         if (sb->sb_flags & SB_SNDBYTE_CNT) {
2679                 inp_decr_sndbytes_total(sb->sb_so, m->m_len);
2680         }
2681 }
2682
2683 /*
2684  * Set lock on sockbuf sb; sleep if lock is already held.
2685  * Unless SB_NOINTR is set on sockbuf, sleep is interruptible.
2686  * Returns error without lock if sleep is interrupted.
2687  */
2688 int
2689 sblock(struct sockbuf *sb, uint32_t flags)
2690 {
2691         boolean_t nointr = ((sb->sb_flags & SB_NOINTR) || (flags & SBL_NOINTR));
2692         void *lr_saved = __builtin_return_address(0);
2693         struct socket *so = sb->sb_so;
2694         void * wchan;
2695         int error = 0;
2696         thread_t tp = current_thread();
2697
2698         VERIFY((flags & SBL_VALID) == flags);
2699
2700         /* so_usecount may be 0 if we get here from sofreelastref() */
2701         if (so == NULL) {
2702                 panic("%s: null so, sb=%p sb_flags=0x%x lr=%p\n",
2703                     __func__, sb, sb->sb_flags, lr_saved);
2704                 /* NOTREACHED */
2705         } else if (so->so_usecount < 0) {
2706                 panic("%s: sb=%p sb_flags=0x%x sb_so=%p usecount=%d lr=%p "
2707                     "lrh= %s\n", __func__, sb, sb->sb_flags, so,
2708                     so->so_usecount, lr_saved, solockhistory_nr(so));
2709                 /* NOTREACHED */
2710         }
2711
2712         /*
2713          * The content filter thread must hold the sockbuf lock
2714          */
2715         if ((so->so_flags & SOF_CONTENT_FILTER) && sb->sb_cfil_thread == tp) {
2716                 /*
2717                  * Don't panic if we are defunct because SB_LOCK has
2718                  * been cleared by sodefunct()
2719                  */
2720                 if (!(so->so_flags & SOF_DEFUNCT) && !(sb->sb_flags & SB_LOCK)) {
2721                         panic("%s: SB_LOCK not held for %p\n",
2722                             __func__, sb);
2723                 }
2724
2725                 /* Keep the sockbuf locked */
2726                 return 0;
2727         }
2728
2729         if ((sb->sb_flags & SB_LOCK) && !(flags & SBL_WAIT)) {
2730                 return EWOULDBLOCK;
2731         }
2732         /*
2733          * We may get here from sorflush(), in which case "sb" may not
2734          * point to the real socket buffer.  Use the actual socket buffer
2735          * address from the socket instead.
2736          */
2737         wchan = (sb->sb_flags & SB_RECV) ?
2738             &so->so_rcv.sb_flags : &so->so_snd.sb_flags;
2739
2740         /*
2741          * A content filter thread has exclusive access to the sockbuf
2742          * until it clears the
2743          */
2744         while ((sb->sb_flags & SB_LOCK) ||
2745             ((so->so_flags & SOF_CONTENT_FILTER) &&
2746             sb->sb_cfil_thread != NULL)) {
2747                 lck_mtx_t *mutex_held;
2748
2749                 /*
2750                  * XXX: This code should be moved up above outside of this loop;
2751                  * however, we may get here as part of sofreelastref(), and
2752                  * at that time pr_getlock() may no longer be able to return
2753                  * us the lock.  This will be fixed in future.
2754                  */
2755                 if (so->so_proto->pr_getlock != NULL) {
2756                         mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
2757                 } else {
2758                         mutex_held = so->so_proto->pr_domain->dom_mtx;
2759                 }
2760
2761                 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
2762
2763                 sb->sb_wantlock++;
2764                 VERIFY(sb->sb_wantlock != 0);
2765
2766                 error = msleep(wchan, mutex_held,
2767                     nointr ? PSOCK : PSOCK | PCATCH,
2768                     nointr ? "sb_lock_nointr" : "sb_lock", NULL);
2769
2770                 VERIFY(sb->sb_wantlock != 0);
2771                 sb->sb_wantlock--;
2772
2773                 if (error == 0 && (so->so_flags & SOF_DEFUNCT) &&
2774                     !(flags & SBL_IGNDEFUNCT)) {
2775                         error = EBADF;
2776                         SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
2777                             "(%d)\n", __func__, proc_selfpid(),
2778                             proc_best_name(current_proc()),
2779                             (uint64_t)VM_KERNEL_ADDRPERM(so),
2780                             SOCK_DOM(so), SOCK_TYPE(so), error);
2781                 }
2782
2783                 if (error != 0) {
2784                         return error;
2785                 }
2786         }
2787         sb->sb_flags |= SB_LOCK;
2788         return 0;
2789 }
2790
2791 /*
2792  * Release lock on sockbuf sb
2793  */
2794 void
2795 sbunlock(struct sockbuf *sb, boolean_t keeplocked)
2796 {
2797         void *lr_saved = __builtin_return_address(0);
2798         struct socket *so = sb->sb_so;
2799         thread_t tp = current_thread();
2800
2801         /* so_usecount may be 0 if we get here from sofreelastref() */
2802         if (so == NULL) {
2803                 panic("%s: null so, sb=%p sb_flags=0x%x lr=%p\n",
2804                     __func__, sb, sb->sb_flags, lr_saved);
2805                 /* NOTREACHED */
2806         } else if (so->so_usecount < 0) {
2807                 panic("%s: sb=%p sb_flags=0x%x sb_so=%p usecount=%d lr=%p "
2808                     "lrh= %s\n", __func__, sb, sb->sb_flags, so,
2809                     so->so_usecount, lr_saved, solockhistory_nr(so));
2810                 /* NOTREACHED */
2811         }
2812
2813         /*
2814          * The content filter thread must hold the sockbuf lock
2815          */
2816         if ((so->so_flags & SOF_CONTENT_FILTER) && sb->sb_cfil_thread == tp) {
2817                 /*
2818                  * Don't panic if we are defunct because SB_LOCK has
2819                  * been cleared by sodefunct()
2820                  */
2821                 if (!(so->so_flags & SOF_DEFUNCT) &&
2822                     !(sb->sb_flags & SB_LOCK) &&
2823                     !(so->so_state & SS_DEFUNCT) &&
2824                     !(so->so_flags1 & SOF1_DEFUNCTINPROG)) {
2825                         panic("%s: SB_LOCK not held for %p\n",
2826                             __func__, sb);
2827                 }
2828                 /* Keep the sockbuf locked and proceed */
2829         } else {
2830                 VERIFY((sb->sb_flags & SB_LOCK) ||
2831                     (so->so_state & SS_DEFUNCT) ||
2832                     (so->so_flags1 & SOF1_DEFUNCTINPROG));
2833
2834                 sb->sb_flags &= ~SB_LOCK;
2835
2836                 if (sb->sb_wantlock > 0) {
2837                         /*
2838                          * We may get here from sorflush(), in which case "sb"
2839                          * may not point to the real socket buffer.  Use the
2840                          * actual socket buffer address from the socket instead.
2841                          */
2842                         wakeup((sb->sb_flags & SB_RECV) ? &so->so_rcv.sb_flags :
2843                             &so->so_snd.sb_flags);
2844                 }
2845         }
2846
2847         if (!keeplocked) {      /* unlock on exit */
2848                 lck_mtx_t *mutex_held;
2849
2850                 if (so->so_proto->pr_getlock != NULL) {
2851                         mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
2852                 } else {
2853                         mutex_held = so->so_proto->pr_domain->dom_mtx;
2854                 }
2855
2856                 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
2857
2858                 VERIFY(so->so_usecount > 0);
2859                 so->so_usecount--;
2860                 so->unlock_lr[so->next_unlock_lr] = lr_saved;
2861                 so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
2862                 lck_mtx_unlock(mutex_held);
2863         }
2864 }
2865
2866 void
2867 sorwakeup(struct socket *so)
2868 {
2869         if (sb_notify(&so->so_rcv)) {
2870                 sowakeup(so, &so->so_rcv);
2871         }
2872 }
2873
2874 void
2875 sowwakeup(struct socket *so)
2876 {
2877         if (sb_notify(&so->so_snd)) {
2878                 sowakeup(so, &so->so_snd);
2879         }
2880 }
2881
2882 void
2883 soevent(struct socket *so, long hint)
2884 {
2885         if (so->so_flags & SOF_KNOTE) {
2886                 KNOTE(&so->so_klist, hint);
2887         }
2888
2889         soevupcall(so, hint);
2890
2891         /*
2892          * Don't post an event if this a subflow socket or
2893          * the app has opted out of using cellular interface
2894          */
2895         if ((hint & SO_FILT_HINT_IFDENIED) &&
2896             !(so->so_flags & SOF_MP_SUBFLOW) &&
2897             !(so->so_restrictions & SO_RESTRICT_DENY_CELLULAR) &&
2898             !(so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE)) {
2899                 soevent_ifdenied(so);
2900         }
2901 }
2902
2903 void
2904 soevupcall(struct socket *so, u_int32_t hint)
2905 {
2906         if (so->so_event != NULL) {
2907                 caddr_t so_eventarg = so->so_eventarg;
2908
2909                 hint &= so->so_eventmask;
2910                 if (hint != 0) {
2911                         so->so_event(so, so_eventarg, hint);
2912                 }
2913         }
2914 }
2915
2916 static void
2917 soevent_ifdenied(struct socket *so)
2918 {
2919         struct kev_netpolicy_ifdenied ev_ifdenied;
2920
2921         bzero(&ev_ifdenied, sizeof(ev_ifdenied));
2922         /*
2923          * The event consumer is interested about the effective {upid,pid,uuid}
2924          * info which can be different than the those related to the process
2925          * that recently performed a system call on the socket, i.e. when the
2926          * socket is delegated.
2927          */
2928         if (so->so_flags & SOF_DELEGATED) {
2929                 ev_ifdenied.ev_data.eupid = so->e_upid;
2930                 ev_ifdenied.ev_data.epid = so->e_pid;
2931                 uuid_copy(ev_ifdenied.ev_data.euuid, so->e_uuid);
2932         } else {
2933                 ev_ifdenied.ev_data.eupid = so->last_upid;
2934                 ev_ifdenied.ev_data.epid = so->last_pid;
2935                 uuid_copy(ev_ifdenied.ev_data.euuid, so->last_uuid);
2936         }
2937
2938         if (++so->so_ifdenied_notifies > 1) {
2939                 /*
2940                  * Allow for at most one kernel event to be generated per
2941                  * socket; so_ifdenied_notifies is reset upon changes in
2942                  * the UUID policy.  See comments in inp_update_policy.
2943                  */
2944                 if (net_io_policy_log) {
2945                         uuid_string_t buf;
2946
2947                         uuid_unparse(ev_ifdenied.ev_data.euuid, buf);
2948                         log(LOG_DEBUG, "%s[%d]: so 0x%llx [%d,%d] epid %d "
2949                             "euuid %s%s has %d redundant events supressed\n",
2950                             __func__, so->last_pid,
2951                             (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
2952                             SOCK_TYPE(so), ev_ifdenied.ev_data.epid, buf,
2953                             ((so->so_flags & SOF_DELEGATED) ?
2954                             " [delegated]" : ""), so->so_ifdenied_notifies);
2955                 }
2956         } else {
2957                 if (net_io_policy_log) {
2958                         uuid_string_t buf;
2959
2960                         uuid_unparse(ev_ifdenied.ev_data.euuid, buf);
2961                         log(LOG_DEBUG, "%s[%d]: so 0x%llx [%d,%d] epid %d "
2962                             "euuid %s%s event posted\n", __func__,
2963                             so->last_pid, (uint64_t)VM_KERNEL_ADDRPERM(so),
2964                             SOCK_DOM(so), SOCK_TYPE(so),
2965                             ev_ifdenied.ev_data.epid, buf,
2966                             ((so->so_flags & SOF_DELEGATED) ?
2967                             " [delegated]" : ""));
2968                 }
2969                 netpolicy_post_msg(KEV_NETPOLICY_IFDENIED, &ev_ifdenied.ev_data,
2970                     sizeof(ev_ifdenied));
2971         }
2972 }
2973
2974 /*
2975  * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
2976  */
2977 struct sockaddr *
2978 dup_sockaddr(struct sockaddr *sa, int canwait)
2979 {
2980         struct sockaddr *sa2;
2981
2982         MALLOC(sa2, struct sockaddr *, sa->sa_len, M_SONAME,
2983             canwait ? M_WAITOK : M_NOWAIT);
2984         if (sa2) {
2985                 bcopy(sa, sa2, sa->sa_len);
2986         }
2987         return sa2;
2988 }
2989
2990 /*
2991  * Create an external-format (``xsocket'') structure using the information
2992  * in the kernel-format socket structure pointed to by so.  This is done
2993  * to reduce the spew of irrelevant information over this interface,
2994  * to isolate user code from changes in the kernel structure, and
2995  * potentially to provide information-hiding if we decide that
2996  * some of this information should be hidden from users.
2997  */
2998 void
2999 sotoxsocket(struct socket *so, struct xsocket *xso)
3000 {
3001         xso->xso_len = sizeof(*xso);
3002         xso->xso_so = (_XSOCKET_PTR(struct socket *))VM_KERNEL_ADDRPERM(so);
3003         xso->so_type = so->so_type;
3004         xso->so_options = (short)(so->so_options & 0xffff);
3005         xso->so_linger = so->so_linger;
3006         xso->so_state = so->so_state;
3007         xso->so_pcb = (_XSOCKET_PTR(caddr_t))VM_KERNEL_ADDRPERM(so->so_pcb);
3008         if (so->so_proto) {
3009                 xso->xso_protocol = SOCK_PROTO(so);
3010                 xso->xso_family = SOCK_DOM(so);
3011         } else {
3012                 xso->xso_protocol = xso->xso_family = 0;
3013         }
3014         xso->so_qlen = so->so_qlen;
3015         xso->so_incqlen = so->so_incqlen;
3016         xso->so_qlimit = so->so_qlimit;
3017         xso->so_timeo = so->so_timeo;
3018         xso->so_error = so->so_error;
3019         xso->so_pgid = so->so_pgid;
3020         xso->so_oobmark = so->so_oobmark;
3021         sbtoxsockbuf(&so->so_snd, &xso->so_snd);
3022         sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
3023         xso->so_uid = kauth_cred_getuid(so->so_cred);
3024 }
3025
3026
3027 #if !CONFIG_EMBEDDED
3028
3029 void
3030 sotoxsocket64(struct socket *so, struct xsocket64 *xso)
3031 {
3032         xso->xso_len = sizeof(*xso);
3033         xso->xso_so = (u_int64_t)VM_KERNEL_ADDRPERM(so);
3034         xso->so_type = so->so_type;
3035         xso->so_options = (short)(so->so_options & 0xffff);
3036         xso->so_linger = so->so_linger;
3037         xso->so_state = so->so_state;
3038         xso->so_pcb = (u_int64_t)VM_KERNEL_ADDRPERM(so->so_pcb);
3039         if (so->so_proto) {
3040                 xso->xso_protocol = SOCK_PROTO(so);
3041                 xso->xso_family = SOCK_DOM(so);
3042         } else {
3043                 xso->xso_protocol = xso->xso_family = 0;
3044         }
3045         xso->so_qlen = so->so_qlen;
3046         xso->so_incqlen = so->so_incqlen;
3047         xso->so_qlimit = so->so_qlimit;
3048         xso->so_timeo = so->so_timeo;
3049         xso->so_error = so->so_error;
3050         xso->so_pgid = so->so_pgid;
3051         xso->so_oobmark = so->so_oobmark;
3052         sbtoxsockbuf(&so->so_snd, &xso->so_snd);
3053         sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
3054         xso->so_uid = kauth_cred_getuid(so->so_cred);
3055 }
3056
3057 #endif /* !CONFIG_EMBEDDED */
3058
3059 /*
3060  * This does the same for sockbufs.  Note that the xsockbuf structure,
3061  * since it is always embedded in a socket, does not include a self
3062  * pointer nor a length.  We make this entry point public in case
3063  * some other mechanism needs it.
3064  */
3065 void
3066 sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
3067 {
3068         xsb->sb_cc = sb->sb_cc;
3069         xsb->sb_hiwat = sb->sb_hiwat;
3070         xsb->sb_mbcnt = sb->sb_mbcnt;
3071         xsb->sb_mbmax = sb->sb_mbmax;
3072         xsb->sb_lowat = sb->sb_lowat;
3073         xsb->sb_flags = sb->sb_flags;
3074         xsb->sb_timeo = (short)
3075             (sb->sb_timeo.tv_sec * hz) + sb->sb_timeo.tv_usec / tick;
3076         if (xsb->sb_timeo == 0 && sb->sb_timeo.tv_usec != 0) {
3077                 xsb->sb_timeo = 1;
3078         }
3079 }
3080
3081 /*
3082  * Based on the policy set by an all knowing decison maker, throttle sockets
3083  * that either have been marked as belonging to "background" process.
3084  */
3085 inline int
3086 soisthrottled(struct socket *so)
3087 {
3088         return so->so_flags1 & SOF1_TRAFFIC_MGT_SO_BACKGROUND;
3089 }
3090
3091 inline int
3092 soisprivilegedtraffic(struct socket *so)
3093 {
3094         return (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS) ? 1 : 0;
3095 }
3096
3097 inline int
3098 soissrcbackground(struct socket *so)
3099 {
3100         return (so->so_flags1 & SOF1_TRAFFIC_MGT_SO_BACKGROUND) ||
3101                IS_SO_TC_BACKGROUND(so->so_traffic_class);
3102 }
3103
3104 inline int
3105 soissrcrealtime(struct socket *so)
3106 {
3107         return so->so_traffic_class >= SO_TC_AV &&
3108                so->so_traffic_class <= SO_TC_VO;
3109 }
3110
3111 inline int
3112 soissrcbesteffort(struct socket *so)
3113 {
3114         return so->so_traffic_class == SO_TC_BE ||
3115                so->so_traffic_class == SO_TC_RD ||
3116                so->so_traffic_class == SO_TC_OAM;
3117 }
3118
3119 void
3120 soclearfastopen(struct socket *so)
3121 {
3122         if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
3123                 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
3124         }
3125
3126         if (so->so_flags1 & SOF1_DATA_IDEMPOTENT) {
3127                 so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
3128         }
3129 }
3130
3131 void
3132 sonullevent(struct socket *so, void *arg, uint32_t hint)
3133 {
3134 #pragma unused(so, arg, hint)
3135 }
3136
3137 /*
3138  * Here is the definition of some of the basic objects in the kern.ipc
3139  * branch of the MIB.
3140  */
3141 SYSCTL_NODE(_kern, KERN_IPC, ipc,
3142     CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, 0, "IPC");
3143
3144 /* Check that the maximum socket buffer size is within a range */
3145
3146 static int
3147 sysctl_sb_max SYSCTL_HANDLER_ARGS
3148 {
3149 #pragma unused(oidp, arg1, arg2)
3150         u_int32_t new_value;
3151         int changed = 0;
3152         int error = sysctl_io_number(req, sb_max, sizeof(u_int32_t),
3153             &new_value, &changed);
3154         if (!error && changed) {
3155                 if (new_value > LOW_SB_MAX && new_value <= high_sb_max) {
3156                         sb_max = new_value;
3157                 } else {
3158                         error = ERANGE;
3159                 }
3160         }
3161         return error;
3162 }
3163
3164 SYSCTL_PROC(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf,
3165     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
3166     &sb_max, 0, &sysctl_sb_max, "IU", "Maximum socket buffer size");
3167
3168 SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor,
3169     CTLFLAG_RW | CTLFLAG_LOCKED, &sb_efficiency, 0, "");
3170
3171 SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters,
3172     CTLFLAG_RD | CTLFLAG_LOCKED, &nmbclusters, 0, "");
3173
3174 SYSCTL_INT(_kern_ipc, OID_AUTO, njcl,
3175     CTLFLAG_RD | CTLFLAG_LOCKED, &njcl, 0, "");
3176
3177 SYSCTL_INT(_kern_ipc, OID_AUTO, njclbytes,
3178     CTLFLAG_RD | CTLFLAG_LOCKED, &njclbytes, 0, "");
3179
3180 SYSCTL_INT(_kern_ipc, KIPC_SOQLIMITCOMPAT, soqlimitcompat,
3181     CTLFLAG_RW | CTLFLAG_LOCKED, &soqlimitcompat, 1,
3182     "Enable socket queue limit compatibility");
3183
3184 /*
3185  * Hack alert -- rdar://33572856
3186  * A loopback test we cannot change was failing because it sets
3187  * SO_SENDTIMEO to 5 seconds and that's also the value
3188  * of the minimum persist timer. Because of the persist timer,
3189  * the connection was not idle for 5 seconds and SO_SNDTIMEO
3190  * was not triggering at 5 seconds causing the test failure.
3191  * As a workaround we check the sysctl soqlencomp the test is already
3192  * setting to set disable auto tuning of the receive buffer.
3193  */
3194
3195 extern u_int32_t tcp_do_autorcvbuf;
3196
3197 static int
3198 sysctl_soqlencomp SYSCTL_HANDLER_ARGS
3199 {
3200 #pragma unused(oidp, arg1, arg2)
3201         u_int32_t new_value;
3202         int changed = 0;
3203         int error = sysctl_io_number(req, soqlencomp, sizeof(u_int32_t),
3204             &new_value, &changed);
3205         if (!error && changed) {
3206                 soqlencomp = new_value;
3207                 if (new_value != 0) {
3208                         tcp_do_autorcvbuf = 0;
3209                         tcptv_persmin_val = 6 * TCP_RETRANSHZ;
3210                 }
3211         }
3212         return error;
3213 }
3214 SYSCTL_PROC(_kern_ipc, OID_AUTO, soqlencomp,
3215     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
3216     &soqlencomp, 0, &sysctl_soqlencomp, "IU", "");
3217
3218 SYSCTL_INT(_kern_ipc, OID_AUTO, sbmb_cnt, CTLFLAG_RD | CTLFLAG_LOCKED,
3219     &total_sbmb_cnt, 0, "");
3220 SYSCTL_INT(_kern_ipc, OID_AUTO, sbmb_cnt_peak, CTLFLAG_RD | CTLFLAG_LOCKED,
3221     &total_sbmb_cnt_peak, 0, "");
3222 SYSCTL_INT(_kern_ipc, OID_AUTO, sbmb_cnt_floor, CTLFLAG_RD | CTLFLAG_LOCKED,
3223     &total_sbmb_cnt_floor, 0, "");
3224 SYSCTL_QUAD(_kern_ipc, OID_AUTO, sbmb_limreached, CTLFLAG_RD | CTLFLAG_LOCKED,
3225     &sbmb_limreached, "");
3226
3227
3228 SYSCTL_NODE(_kern_ipc, OID_AUTO, io_policy, CTLFLAG_RW, 0, "network IO policy");
3229
3230 SYSCTL_INT(_kern_ipc_io_policy, OID_AUTO, log, CTLFLAG_RW | CTLFLAG_LOCKED,
3231     &net_io_policy_log, 0, "");
3232
3233 #if CONFIG_PROC_UUID_POLICY
3234 SYSCTL_INT(_kern_ipc_io_policy, OID_AUTO, uuid, CTLFLAG_RW | CTLFLAG_LOCKED,
3235     &net_io_policy_uuid, 0, "");
3236 #endif /* CONFIG_PROC_UUID_POLICY */