/*
 * Copyright (c) 1998-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/malloc.h>
#include <sys/mcache.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/unpcb.h>
#include <kern/locks.h>
#include <net/route.h>
#include <net/content_filter.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp_var.h>
#include <sys/kdebug.h>
#include <libkern/OSAtomic.h>
#include <security/mac_framework.h>
#include <mach/vm_param.h>
#include <netinet/mptcp_var.h>

#define DBG_FNC_SBDROP          NETDBG_CODE(DBG_NETSOCK, 4)
#define DBG_FNC_SBAPPEND        NETDBG_CODE(DBG_NETSOCK, 5)

SYSCTL_DECL(_kern_ipc);

__private_extern__ u_int32_t net_io_policy_throttle_best_effort = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, throttle_best_effort,
    CTLFLAG_RW | CTLFLAG_LOCKED, &net_io_policy_throttle_best_effort, 0, "");

static inline void sbcompress(struct sockbuf *, struct mbuf *, struct mbuf *);
static struct socket *sonewconn_internal(struct socket *, int);
static int sbappendcontrol_internal(struct sockbuf *, struct mbuf *,
    struct mbuf *);
static void soevent_ifdenied(struct socket *);

static int sbappendrecord_common(struct sockbuf *sb, struct mbuf *m0, boolean_t nodrop);
static int sbappend_common(struct sockbuf *sb, struct mbuf *m, boolean_t nodrop);
/*
 * Primitive routines for operating on sockets and socket buffers
 */
static int soqlimitcompat = 1;
static int soqlencomp = 0;

/*
 * Based on the number of mbuf clusters configured, high_sb_max and sb_max can
 * get scaled up or down to suit that memory configuration. high_sb_max is a
 * higher limit on sb_max that is checked when sb_max gets set through sysctl.
 */
u_int32_t       sb_max = SB_MAX;                /* XXX should be static */
u_int32_t       high_sb_max = SB_MAX;

static u_int32_t sb_efficiency = 8;             /* parameter for sbreserve() */
int32_t total_sbmb_cnt __attribute__((aligned(8))) = 0;
int32_t total_sbmb_cnt_floor __attribute__((aligned(8))) = 0;
int32_t total_sbmb_cnt_peak __attribute__((aligned(8))) = 0;
int64_t sbmb_limreached __attribute__((aligned(8))) = 0;

u_int32_t net_io_policy_log = 0;                /* log socket policy changes */
#if CONFIG_PROC_UUID_POLICY
u_int32_t net_io_policy_uuid = 1;               /* enable UUID socket policy */
#endif /* CONFIG_PROC_UUID_POLICY */
/*
 * Procedures to manipulate state flags of socket
 * and do appropriate wakeups.  Normal sequence from the
 * active (originating) side is that soisconnecting() is
 * called during processing of connect() call,
 * resulting in an eventual call to soisconnected() if/when the
 * connection is established.  When the connection is torn down
 * soisdisconnecting() is called during processing of disconnect() call,
 * and soisdisconnected() is called when the connection to the peer
 * is totally severed.  The semantics of these routines are such that
 * connectionless protocols can call soisconnected() and soisdisconnected()
 * only, bypassing the in-progress calls when setting up a ``connection''
 * takes no time.
 *
 * From the passive side, a socket is created with
 * two queues of sockets: so_incomp for connections in progress
 * and so_comp for connections already made and awaiting user acceptance.
 * As a protocol is preparing incoming connections, it creates a socket
 * structure queued on so_incomp by calling sonewconn().  When the connection
 * is established, soisconnected() is called, and transfers the
 * socket structure to so_comp, making it available to accept().
 *
 * If a socket is closed with sockets on either
 * so_incomp or so_comp, these sockets are dropped.
 *
 * If higher level protocols are implemented in
 * the kernel, the wakeups done here will sometimes
 * cause software-interrupt process scheduling.
 */
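/*
 * Illustrative sketch (not part of this file's build): a connection-oriented
 * protocol typically drives the state helpers below from its own state
 * machine.  The call sites named here are assumptions, and the socket is
 * assumed to be locked by the caller.
 *
 *	// active open: PRU_CONNECT handler
 *	soisconnecting(so);
 *	... emit the protocol's connection request ...
 *
 *	// later, when the handshake completes in the input path
 *	soisconnected(so);
 *
 *	// teardown
 *	soisdisconnecting(so);      // local close initiated
 *	soisdisconnected(so);       // connection to the peer fully severed
 */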
void
soisconnecting(struct socket *so)
{
	so->so_state &= ~(SS_ISCONNECTED | SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;

	sflt_notify(so, sock_evt_connecting, NULL);
}
190 soisconnected(struct socket
*so
)
193 * If socket is subject to filter and is pending initial verdict,
194 * delay marking socket as connected and do not present the connected
195 * socket to user just yet.
197 if (cfil_sock_connected_pending_verdict(so
)) {
201 so
->so_state
&= ~(SS_ISCONNECTING
| SS_ISDISCONNECTING
| SS_ISCONFIRMING
);
202 so
->so_state
|= SS_ISCONNECTED
;
204 soreserve_preconnect(so
, 0);
206 sflt_notify(so
, sock_evt_connected
, NULL
);
208 if (so
->so_head
!= NULL
&& (so
->so_state
& SS_INCOMP
)) {
209 struct socket
*head
= so
->so_head
;
213 * Enforce lock order when the protocol has per socket locks
215 if (head
->so_proto
->pr_getlock
!= NULL
) {
216 socket_lock(head
, 1);
217 so_acquire_accept_list(head
, so
);
220 if (so
->so_head
== head
&& (so
->so_state
& SS_INCOMP
)) {
221 so
->so_state
&= ~SS_INCOMP
;
222 so
->so_state
|= SS_COMP
;
223 TAILQ_REMOVE(&head
->so_incomp
, so
, so_list
);
224 TAILQ_INSERT_TAIL(&head
->so_comp
, so
, so_list
);
228 * We have to release the accept list in
229 * case a socket callback calls sock_accept()
232 so_release_accept_list(head
);
233 socket_unlock(so
, 0);
236 wakeup_one((caddr_t
)&head
->so_timeo
);
239 socket_unlock(head
, 1);
242 } else if (locked
!= 0) {
243 so_release_accept_list(head
);
244 socket_unlock(head
, 1);
247 wakeup((caddr_t
)&so
->so_timeo
);
250 soevent(so
, SO_FILT_HINT_LOCKED
| SO_FILT_HINT_CONNECTED
|
251 SO_FILT_HINT_CONNINFO_UPDATED
);
boolean_t
socanwrite(struct socket *so)
{
	return (so->so_state & SS_ISCONNECTED) ||
	       !(so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	       (so->so_flags1 & SOF1_PRECONNECT_DATA);
}
void
soisdisconnecting(struct socket *so)
{
	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= (SS_ISDISCONNECTING | SS_CANTRCVMORE | SS_CANTSENDMORE);
	soevent(so, SO_FILT_HINT_LOCKED);
	sflt_notify(so, sock_evt_disconnecting, NULL);
	wakeup((caddr_t)&so->so_timeo);
}
void
soisdisconnected(struct socket *so)
{
	so->so_state &= ~(SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING);
	so->so_state |= (SS_CANTRCVMORE | SS_CANTSENDMORE | SS_ISDISCONNECTED);
	soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED |
	    SO_FILT_HINT_CONNINFO_UPDATED);
	sflt_notify(so, sock_evt_disconnected, NULL);
	wakeup((caddr_t)&so->so_timeo);

#if CONTENT_FILTER
	/* Notify content filters as soon as we cannot send/receive data */
	cfil_sock_notify_shutdown(so, SHUT_RDWR);
#endif /* CONTENT_FILTER */
}
/*
 * This function will issue a wakeup like soisdisconnected but it will not
 * notify the socket filters. This will avoid unlocking the socket
 * in the midst of closing it.
 */
void
sodisconnectwakeup(struct socket *so)
{
	so->so_state &= ~(SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING);
	so->so_state |= (SS_CANTRCVMORE | SS_CANTSENDMORE | SS_ISDISCONNECTED);
	soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED |
	    SO_FILT_HINT_CONNINFO_UPDATED);
	wakeup((caddr_t)&so->so_timeo);

#if CONTENT_FILTER
	/* Notify content filters as soon as we cannot send/receive data */
	cfil_sock_notify_shutdown(so, SHUT_RDWR);
#endif /* CONTENT_FILTER */
}
/*
 * When an attempt at a new connection is noted on a socket
 * which accepts connections, sonewconn is called.  If the
 * connection is possible (subject to space constraints, etc.)
 * then we allocate a new structure, properly linked into the
 * data structure of the original socket, and return this.
 * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED.
 */
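/*
 * Illustrative sketch (not part of this file's build): a protocol's input
 * path asks the listening socket for a child socket when a connection
 * request arrives.  `from' is the peer's address and the listener `head'
 * is assumed to be locked; the surrounding names are assumptions.
 *
 *	struct socket *so2 = sonewconn(head, 0, (struct sockaddr *)&from);
 *	if (so2 == NULL) {
 *		// queue limit reached or a socket filter rejected it; drop
 *		return;
 *	}
 *	// ... attach protocol state to so2; later, soisconnected(so2)
 *	// moves it from so_incomp to so_comp where accept() can find it.
 */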
323 static struct socket
*
324 sonewconn_internal(struct socket
*head
, int connstatus
)
326 int so_qlen
, error
= 0;
328 lck_mtx_t
*mutex_held
;
330 if (head
->so_proto
->pr_getlock
!= NULL
) {
331 mutex_held
= (*head
->so_proto
->pr_getlock
)(head
, 0);
333 mutex_held
= head
->so_proto
->pr_domain
->dom_mtx
;
335 LCK_MTX_ASSERT(mutex_held
, LCK_MTX_ASSERT_OWNED
);
339 * This is the default case; so_qlen represents the
340 * sum of both incomplete and completed queues.
342 so_qlen
= head
->so_qlen
;
345 * When kern.ipc.soqlencomp is set to 1, so_qlen
346 * represents only the completed queue. Since we
347 * cannot let the incomplete queue goes unbounded
348 * (in case of SYN flood), we cap the incomplete
349 * queue length to at most somaxconn, and use that
350 * as so_qlen so that we fail immediately below.
352 so_qlen
= head
->so_qlen
- head
->so_incqlen
;
353 if (head
->so_incqlen
> somaxconn
) {
359 (soqlimitcompat
? head
->so_qlimit
: (3 * head
->so_qlimit
/ 2))) {
360 return (struct socket
*)0;
362 so
= soalloc(1, SOCK_DOM(head
), head
->so_type
);
364 return (struct socket
*)0;
366 /* check if head was closed during the soalloc */
367 if (head
->so_proto
== NULL
) {
369 return (struct socket
*)0;
372 so
->so_type
= head
->so_type
;
373 so
->so_options
= head
->so_options
& ~SO_ACCEPTCONN
;
374 so
->so_linger
= head
->so_linger
;
375 so
->so_state
= head
->so_state
| SS_NOFDREF
;
376 so
->so_proto
= head
->so_proto
;
377 so
->so_timeo
= head
->so_timeo
;
378 so
->so_pgid
= head
->so_pgid
;
379 kauth_cred_ref(head
->so_cred
);
380 so
->so_cred
= head
->so_cred
;
381 so
->last_pid
= head
->last_pid
;
382 so
->last_upid
= head
->last_upid
;
383 memcpy(so
->last_uuid
, head
->last_uuid
, sizeof(so
->last_uuid
));
384 if (head
->so_flags
& SOF_DELEGATED
) {
385 so
->e_pid
= head
->e_pid
;
386 so
->e_upid
= head
->e_upid
;
387 memcpy(so
->e_uuid
, head
->e_uuid
, sizeof(so
->e_uuid
));
389 /* inherit socket options stored in so_flags */
390 so
->so_flags
= head
->so_flags
&
391 (SOF_NOSIGPIPE
| SOF_NOADDRAVAIL
| SOF_REUSESHAREUID
|
392 SOF_NOTIFYCONFLICT
| SOF_BINDRANDOMPORT
| SOF_NPX_SETOPTSHUT
|
393 SOF_NODEFUNCT
| SOF_PRIVILEGED_TRAFFIC_CLASS
| SOF_NOTSENT_LOWAT
|
395 so
->so_flags1
|= SOF1_INBOUND
;
397 so
->next_lock_lr
= 0;
398 so
->next_unlock_lr
= 0;
400 so
->so_rcv
.sb_flags
|= SB_RECV
; /* XXX */
401 so
->so_rcv
.sb_so
= so
->so_snd
.sb_so
= so
;
403 /* inherit traffic management properties of listener */
405 head
->so_flags1
& (SOF1_TRAFFIC_MGT_SO_BACKGROUND
| SOF1_TC_NET_SERV_TYPE
|
406 SOF1_QOSMARKING_ALLOWED
| SOF1_QOSMARKING_POLICY_OVERRIDE
);
407 so
->so_background_thread
= head
->so_background_thread
;
408 so
->so_traffic_class
= head
->so_traffic_class
;
409 so
->so_netsvctype
= head
->so_netsvctype
;
411 if (soreserve(so
, head
->so_snd
.sb_hiwat
, head
->so_rcv
.sb_hiwat
)) {
413 return (struct socket
*)0;
415 so
->so_rcv
.sb_flags
|= (head
->so_rcv
.sb_flags
& SB_USRSIZE
);
416 so
->so_snd
.sb_flags
|= (head
->so_snd
.sb_flags
& SB_USRSIZE
);
419 * Must be done with head unlocked to avoid deadlock
420 * for protocol with per socket mutexes.
422 if (head
->so_proto
->pr_unlock
) {
423 socket_unlock(head
, 0);
425 if (((*so
->so_proto
->pr_usrreqs
->pru_attach
)(so
, 0, NULL
) != 0) ||
428 if (head
->so_proto
->pr_unlock
) {
429 socket_lock(head
, 0);
431 return (struct socket
*)0;
433 if (head
->so_proto
->pr_unlock
) {
434 socket_lock(head
, 0);
436 * Radar 7385998 Recheck that the head is still accepting
437 * to avoid race condition when head is getting closed.
439 if ((head
->so_options
& SO_ACCEPTCONN
) == 0) {
440 so
->so_state
&= ~SS_NOFDREF
;
442 return (struct socket
*)0;
446 if (so
->so_proto
->pr_copy_last_owner
!= NULL
) {
447 (*so
->so_proto
->pr_copy_last_owner
)(so
, head
);
449 atomic_add_32(&so
->so_proto
->pr_domain
->dom_refs
, 1);
451 /* Insert in head appropriate lists */
452 so_acquire_accept_list(head
, NULL
);
457 * Since this socket is going to be inserted into the incomp
458 * queue, it can be picked up by another thread in
459 * tcp_dropdropablreq to get dropped before it is setup..
460 * To prevent this race, set in-progress flag which can be
463 so
->so_flags
|= SOF_INCOMP_INPROGRESS
;
466 TAILQ_INSERT_TAIL(&head
->so_comp
, so
, so_list
);
467 so
->so_state
|= SS_COMP
;
469 TAILQ_INSERT_TAIL(&head
->so_incomp
, so
, so_list
);
470 so
->so_state
|= SS_INCOMP
;
475 so_release_accept_list(head
);
477 /* Attach socket filters for this protocol */
481 so
->so_state
|= connstatus
;
483 wakeup((caddr_t
)&head
->so_timeo
);
490 sonewconn(struct socket
*head
, int connstatus
, const struct sockaddr
*from
)
492 int error
= sflt_connectin(head
, from
);
497 return sonewconn_internal(head
, connstatus
);
/*
 * Socantsendmore indicates that no more data will be sent on the
 * socket; it would normally be applied to a socket when the user
 * informs the system that no more data is to be sent, by the protocol
 * code (in case PRU_SHUTDOWN).  Socantrcvmore indicates that no more data
 * will be received, and will normally be applied to the socket by a
 * protocol when it detects that the peer will send no more data.
 * Data queued for reading in the socket may yet be read.
 */
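/*
 * Illustrative sketch (not part of this file's build): a protocol's shutdown
 * and input paths typically use these as follows (socket assumed locked):
 *
 *	// PRU_SHUTDOWN handler: the user will send no more data
 *	socantsendmore(so);
 *
 *	// input path: the peer signalled end-of-stream (e.g. a TCP FIN)
 *	socantrcvmore(so);
 *	sorwakeup(so);          // let readers drain what is already queued
 */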
void
socantsendmore(struct socket *so)
{
	so->so_state |= SS_CANTSENDMORE;
	soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CANTSENDMORE);
	sflt_notify(so, sock_evt_cantsendmore, NULL);
}

void
socantrcvmore(struct socket *so)
{
	so->so_state |= SS_CANTRCVMORE;
	soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CANTRCVMORE);
	sflt_notify(so, sock_evt_cantrecvmore, NULL);
}
/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
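/*
 * Illustrative sketch (not part of this file's build): a blocking receive
 * path sleeps on the receive buffer until data shows up or the wait fails
 * (timeout, signal, defunct socket).  Socket assumed locked; the loop
 * condition is an assumption for illustration.
 *
 *	while (so->so_rcv.sb_cc < so->so_rcv.sb_lowat &&
 *	    !(so->so_state & SS_CANTRCVMORE)) {
 *		int error = sbwait(&so->so_rcv);
 *		if (error != 0)
 *			return error;
 *	}
 */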
532 sbwait(struct sockbuf
*sb
)
534 boolean_t nointr
= (sb
->sb_flags
& SB_NOINTR
);
535 void *lr_saved
= __builtin_return_address(0);
536 struct socket
*so
= sb
->sb_so
;
537 lck_mtx_t
*mutex_held
;
542 panic("%s: null so, sb=%p sb_flags=0x%x lr=%p\n",
543 __func__
, sb
, sb
->sb_flags
, lr_saved
);
545 } else if (so
->so_usecount
< 1) {
546 panic("%s: sb=%p sb_flags=0x%x sb_so=%p usecount=%d lr=%p "
547 "lrh= %s\n", __func__
, sb
, sb
->sb_flags
, so
,
548 so
->so_usecount
, lr_saved
, solockhistory_nr(so
));
552 if ((so
->so_state
& SS_DRAINING
) || (so
->so_flags
& SOF_DEFUNCT
)) {
554 if (so
->so_flags
& SOF_DEFUNCT
) {
555 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
556 "(%d)\n", __func__
, proc_selfpid(),
557 proc_best_name(current_proc()),
558 (uint64_t)VM_KERNEL_ADDRPERM(so
),
559 SOCK_DOM(so
), SOCK_TYPE(so
), error
);
564 if (so
->so_proto
->pr_getlock
!= NULL
) {
565 mutex_held
= (*so
->so_proto
->pr_getlock
)(so
, PR_F_WILLUNLOCK
);
567 mutex_held
= so
->so_proto
->pr_domain
->dom_mtx
;
570 LCK_MTX_ASSERT(mutex_held
, LCK_MTX_ASSERT_OWNED
);
572 ts
.tv_sec
= sb
->sb_timeo
.tv_sec
;
573 ts
.tv_nsec
= sb
->sb_timeo
.tv_usec
* 1000;
576 VERIFY(sb
->sb_waiters
!= 0);
578 error
= msleep((caddr_t
)&sb
->sb_cc
, mutex_held
,
579 nointr
? PSOCK
: PSOCK
| PCATCH
,
580 nointr
? "sbwait_nointr" : "sbwait", &ts
);
582 VERIFY(sb
->sb_waiters
!= 0);
585 if (so
->so_usecount
< 1) {
586 panic("%s: 2 sb=%p sb_flags=0x%x sb_so=%p usecount=%d lr=%p "
587 "lrh= %s\n", __func__
, sb
, sb
->sb_flags
, so
,
588 so
->so_usecount
, lr_saved
, solockhistory_nr(so
));
592 if ((so
->so_state
& SS_DRAINING
) || (so
->so_flags
& SOF_DEFUNCT
)) {
594 if (so
->so_flags
& SOF_DEFUNCT
) {
595 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
596 "(%d)\n", __func__
, proc_selfpid(),
597 proc_best_name(current_proc()),
598 (uint64_t)VM_KERNEL_ADDRPERM(so
),
599 SOCK_DOM(so
), SOCK_TYPE(so
), error
);
607 sbwakeup(struct sockbuf
*sb
)
609 if (sb
->sb_waiters
> 0) {
610 wakeup((caddr_t
)&sb
->sb_cc
);
/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket has the SS_ASYNC flag set.
 */
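/*
 * Illustrative sketch (not part of this file's build): SS_ASYNC is normally
 * set on behalf of a user process that requested asynchronous I/O
 * notification, e.g. from user space:
 *
 *	fcntl(s, F_SETOWN, getpid());   // who receives SIGIO
 *	fcntl(s, F_SETFL, O_ASYNC);     // marks the socket for async signals
 *
 * After that, sowakeup() delivers SIGIO to the owning process or process
 * group whenever the buffer becomes readable or writable.
 */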
620 sowakeup(struct socket
*so
, struct sockbuf
*sb
, struct socket
*so2
)
622 if (so
->so_flags
& SOF_DEFUNCT
) {
623 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] si 0x%x, "
624 "fl 0x%x [%s]\n", __func__
, proc_selfpid(),
625 proc_best_name(current_proc()),
626 (uint64_t)VM_KERNEL_ADDRPERM(so
), SOCK_DOM(so
),
627 SOCK_TYPE(so
), (uint32_t)sb
->sb_sel
.si_flags
, sb
->sb_flags
,
628 (sb
->sb_flags
& SB_RECV
) ? "rcv" : "snd");
631 sb
->sb_flags
&= ~SB_SEL
;
632 selwakeup(&sb
->sb_sel
);
634 if (so
->so_state
& SS_ASYNC
) {
635 if (so
->so_pgid
< 0) {
636 gsignal(-so
->so_pgid
, SIGIO
);
637 } else if (so
->so_pgid
> 0) {
638 proc_signal(so
->so_pgid
, SIGIO
);
641 if (sb
->sb_flags
& SB_KNOTE
) {
642 KNOTE(&sb
->sb_sel
.si_note
, SO_FILT_HINT_LOCKED
);
644 if (sb
->sb_flags
& SB_UPCALL
) {
645 void (*sb_upcall
)(struct socket
*, void *, int);
646 caddr_t sb_upcallarg
;
647 int lock
= !(sb
->sb_flags
& SB_UPCALL_LOCK
);
649 sb_upcall
= sb
->sb_upcall
;
650 sb_upcallarg
= sb
->sb_upcallarg
;
651 /* Let close know that we're about to do an upcall */
652 so
->so_upcallusecount
++;
656 struct unpcb
*unp
= sotounpcb(so2
);
657 unp
->unp_flags
|= UNP_DONTDISCONNECT
;
660 socket_unlock(so2
, 0);
662 socket_unlock(so
, 0);
664 (*sb_upcall
)(so
, sb_upcallarg
, M_DONTWAIT
);
666 if (so2
&& so
> so2
) {
670 unp
= sotounpcb(so2
);
672 if (unp
->rw_thrcount
== 0) {
673 unp
->unp_flags
&= ~UNP_DONTDISCONNECT
;
680 if (so2
&& so
< so2
) {
684 unp
= sotounpcb(so2
);
686 if (unp
->rw_thrcount
== 0) {
687 unp
->unp_flags
&= ~UNP_DONTDISCONNECT
;
693 so
->so_upcallusecount
--;
694 /* Tell close that it's safe to proceed */
695 if ((so
->so_flags
& SOF_CLOSEWAIT
) &&
696 so
->so_upcallusecount
== 0) {
697 wakeup((caddr_t
)&so
->so_upcallusecount
);
702 * Trap disconnection events for content filters
704 if ((so
->so_flags
& SOF_CONTENT_FILTER
) != 0) {
705 if ((sb
->sb_flags
& SB_RECV
)) {
706 if (so
->so_state
& (SS_CANTRCVMORE
)) {
707 cfil_sock_notify_shutdown(so
, SHUT_RD
);
710 if (so
->so_state
& (SS_CANTSENDMORE
)) {
711 cfil_sock_notify_shutdown(so
, SHUT_WR
);
715 #endif /* CONTENT_FILTER */
/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing select() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field.  The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbuf's must be of type MT_RIGHTS).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space to the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */
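/*
 * Illustrative sketch (not part of this file's build): for a datagram
 * socket, one received packet typically becomes one record laid out as
 * described above.  With sender address `from', optional control data `ctl'
 * and payload chain `m' (names assumed, socket buffer locked), the protocol
 * would do:
 *
 *	int err = 0;
 *	if (sbappendaddr(&so->so_rcv, (struct sockaddr *)&from, m, ctl, &err)) {
 *		sorwakeup(so);          // record: MT_SONAME -> control -> data
 *	} else {
 *		// no space or no mbufs; sbappendaddr disposed of the chains
 *		// and set err (typically ENOBUFS)
 *	}
 */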
755 soreserve(struct socket
*so
, uint32_t sndcc
, uint32_t rcvcc
)
758 * We do not want to fail the creation of a socket
759 * when kern.ipc.maxsockbuf is less than the
760 * default socket buffer socket size of the protocol
761 * so force the buffer sizes to be at most the
762 * limit enforced by sbreserve()
764 uint64_t maxcc
= (uint64_t)sb_max
* MCLBYTES
/ (MSIZE
+ MCLBYTES
);
766 sndcc
= (uint32_t)maxcc
;
769 rcvcc
= (uint32_t)maxcc
;
771 if (sbreserve(&so
->so_snd
, sndcc
) == 0) {
774 so
->so_snd
.sb_idealsize
= sndcc
;
777 if (sbreserve(&so
->so_rcv
, rcvcc
) == 0) {
780 so
->so_rcv
.sb_idealsize
= rcvcc
;
783 if (so
->so_rcv
.sb_lowat
== 0) {
784 so
->so_rcv
.sb_lowat
= 1;
786 if (so
->so_snd
.sb_lowat
== 0) {
787 so
->so_snd
.sb_lowat
= MCLBYTES
;
789 if (so
->so_snd
.sb_lowat
> so
->so_snd
.sb_hiwat
) {
790 so
->so_snd
.sb_lowat
= so
->so_snd
.sb_hiwat
;
794 so
->so_snd
.sb_flags
&= ~SB_SEL
;
795 selthreadclear(&so
->so_snd
.sb_sel
);
796 sbrelease(&so
->so_snd
);
802 soreserve_preconnect(struct socket
*so
, unsigned int pre_cc
)
804 /* As of now, same bytes for both preconnect read and write */
805 so
->so_snd
.sb_preconn_hiwat
= pre_cc
;
806 so
->so_rcv
.sb_preconn_hiwat
= pre_cc
;
/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 */
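/*
 * Illustrative arithmetic (not part of this file's build): a request of
 * cc bytes sets the byte high-water mark to cc and the mbuf-accounting
 * limit to min(cc * sb_efficiency, sb_max).  For example, with
 * sb_efficiency = 8 and cc = 64 KB, sb_mbmax becomes 512 KB (unless capped
 * by sb_max), leaving room for mbuf overhead on poorly packed chains.
 * The hard ceiling checked first is sb_max * MCLBYTES / (MSIZE + MCLBYTES),
 * i.e. sb_max discounted by per-cluster bookkeeping overhead.
 */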
int
sbreserve(struct sockbuf *sb, u_int32_t cc)
{
	if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES) ||
	    (cc > sb->sb_hiwat && (sb->sb_flags & SB_LIMITED))) {
		return 0;
	}
	sb->sb_hiwat = cc;
	sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
	if (sb->sb_lowat > sb->sb_hiwat) {
		sb->sb_lowat = sb->sb_hiwat;
	}
	return 1;
}
/*
 * Free mbufs held by a socket, and reserved mbuf space.
 */
/* WARNING needs to do selthreadclear() before calling this */
void
sbrelease(struct sockbuf *sb)
{
	sbflush(sb);
	sb->sb_hiwat = sb->sb_mbmax = 0;
}
/*
 * Routines to add and remove
 * data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, comparing the function sbspace() with the amount
 * of data to be added.  sbappendrecord() differs from sbappend() in
 * that data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendrights()
 * should be used.  In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copy for output to a peer,
 * and then removing the data from the socket buffer with sbdrop()
 * or sbdroprecord() when the data is acknowledged by the peer.
 */
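/*
 * Illustrative sketch (not part of this file's build): a reliable stream
 * protocol keeps unacknowledged data in so_snd and trims it as ACKs arrive.
 * Socket assumed locked; `off', `len' and `acked' are hypothetical values.
 *
 *	sbappendstream(&so->so_snd, m);                 // queue new data
 *	t = m_copym(so->so_snd.sb_mb, off, len, M_DONTWAIT);  // copy for the wire
 *	... transmit t ...
 *	sbdrop(&so->so_snd, acked);                     // free acknowledged bytes
 *	sowwakeup(so);                                  // writers may continue
 */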
/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated with
 * the mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
873 sbappend_common(struct sockbuf
*sb
, struct mbuf
*m
, boolean_t nodrop
)
875 struct socket
*so
= sb
->sb_so
;
877 if (m
== NULL
|| (sb
->sb_flags
& SB_DROP
)) {
878 if (m
!= NULL
&& !nodrop
) {
884 SBLASTRECORDCHK(sb
, "sbappend 1");
886 if (sb
->sb_lastrecord
!= NULL
&& (sb
->sb_mbtail
->m_flags
& M_EOR
)) {
887 return sbappendrecord_common(sb
, m
, nodrop
);
890 if (SOCK_DOM(sb
->sb_so
) == PF_INET
|| SOCK_DOM(sb
->sb_so
) == PF_INET6
) {
891 ASSERT(nodrop
== FALSE
);
892 if (sb
->sb_flags
& SB_RECV
&& !(m
&& m
->m_flags
& M_SKIPCFIL
)) {
893 int error
= sflt_data_in(so
, NULL
, &m
, NULL
, 0);
894 SBLASTRECORDCHK(sb
, "sbappend 2");
898 error
= cfil_sock_data_in(so
, NULL
, m
, NULL
, 0);
900 #endif /* CONTENT_FILTER */
903 if (error
!= EJUSTRETURN
) {
909 m
->m_flags
&= ~M_SKIPCFIL
;
913 /* If this is the first record, it's also the last record */
914 if (sb
->sb_lastrecord
== NULL
) {
915 sb
->sb_lastrecord
= m
;
918 sbcompress(sb
, m
, sb
->sb_mbtail
);
919 SBLASTRECORDCHK(sb
, "sbappend 3");
924 sbappend(struct sockbuf
*sb
, struct mbuf
*m
)
926 return sbappend_common(sb
, m
, FALSE
);
930 sbappend_nodrop(struct sockbuf
*sb
, struct mbuf
*m
)
932 return sbappend_common(sb
, m
, TRUE
);
/*
 * Similar to sbappend, except that this is optimized for stream sockets.
 */
939 sbappendstream(struct sockbuf
*sb
, struct mbuf
*m
)
941 struct socket
*so
= sb
->sb_so
;
943 if (m
== NULL
|| (sb
->sb_flags
& SB_DROP
)) {
950 if (m
->m_nextpkt
!= NULL
|| (sb
->sb_mb
!= sb
->sb_lastrecord
)) {
951 panic("sbappendstream: nexpkt %p || mb %p != lastrecord %p\n",
952 m
->m_nextpkt
, sb
->sb_mb
, sb
->sb_lastrecord
);
956 SBLASTMBUFCHK(sb
, __func__
);
958 if (SOCK_DOM(sb
->sb_so
) == PF_INET
|| SOCK_DOM(sb
->sb_so
) == PF_INET6
) {
959 if (sb
->sb_flags
& SB_RECV
&& !(m
&& m
->m_flags
& M_SKIPCFIL
)) {
960 int error
= sflt_data_in(so
, NULL
, &m
, NULL
, 0);
961 SBLASTRECORDCHK(sb
, "sbappendstream 1");
965 error
= cfil_sock_data_in(so
, NULL
, m
, NULL
, 0);
967 #endif /* CONTENT_FILTER */
970 if (error
!= EJUSTRETURN
) {
976 m
->m_flags
&= ~M_SKIPCFIL
;
980 sbcompress(sb
, m
, sb
->sb_mbtail
);
981 sb
->sb_lastrecord
= sb
->sb_mb
;
982 SBLASTRECORDCHK(sb
, "sbappendstream 2");
988 sbcheck(struct sockbuf
*sb
)
992 u_int32_t len
= 0, mbcnt
= 0;
993 lck_mtx_t
*mutex_held
;
995 if (sb
->sb_so
->so_proto
->pr_getlock
!= NULL
) {
996 mutex_held
= (*sb
->sb_so
->so_proto
->pr_getlock
)(sb
->sb_so
, 0);
998 mutex_held
= sb
->sb_so
->so_proto
->pr_domain
->dom_mtx
;
1001 LCK_MTX_ASSERT(mutex_held
, LCK_MTX_ASSERT_OWNED
);
1003 if (sbchecking
== 0) {
1007 for (m
= sb
->sb_mb
; m
; m
= n
) {
1009 for (; m
; m
= m
->m_next
) {
1012 /* XXX pretty sure this is bogus */
1013 if (m
->m_flags
& M_EXT
) {
1014 mbcnt
+= m
->m_ext
.ext_size
;
1018 if (len
!= sb
->sb_cc
|| mbcnt
!= sb
->sb_mbcnt
) {
1019 panic("cc %ld != %ld || mbcnt %ld != %ld\n", len
, sb
->sb_cc
,
1020 mbcnt
, sb
->sb_mbcnt
);
1026 sblastrecordchk(struct sockbuf
*sb
, const char *where
)
1028 struct mbuf
*m
= sb
->sb_mb
;
1030 while (m
&& m
->m_nextpkt
) {
1034 if (m
!= sb
->sb_lastrecord
) {
1035 printf("sblastrecordchk: mb 0x%llx lastrecord 0x%llx "
1037 (uint64_t)VM_KERNEL_ADDRPERM(sb
->sb_mb
),
1038 (uint64_t)VM_KERNEL_ADDRPERM(sb
->sb_lastrecord
),
1039 (uint64_t)VM_KERNEL_ADDRPERM(m
));
1040 printf("packet chain:\n");
1041 for (m
= sb
->sb_mb
; m
!= NULL
; m
= m
->m_nextpkt
) {
1042 printf("\t0x%llx\n", (uint64_t)VM_KERNEL_ADDRPERM(m
));
1044 panic("sblastrecordchk from %s", where
);
1049 sblastmbufchk(struct sockbuf
*sb
, const char *where
)
1051 struct mbuf
*m
= sb
->sb_mb
;
1054 while (m
&& m
->m_nextpkt
) {
1058 while (m
&& m
->m_next
) {
1062 if (m
!= sb
->sb_mbtail
) {
1063 printf("sblastmbufchk: mb 0x%llx mbtail 0x%llx last 0x%llx\n",
1064 (uint64_t)VM_KERNEL_ADDRPERM(sb
->sb_mb
),
1065 (uint64_t)VM_KERNEL_ADDRPERM(sb
->sb_mbtail
),
1066 (uint64_t)VM_KERNEL_ADDRPERM(m
));
1067 printf("packet tree:\n");
1068 for (m
= sb
->sb_mb
; m
!= NULL
; m
= m
->m_nextpkt
) {
1070 for (n
= m
; n
!= NULL
; n
= n
->m_next
) {
1072 (uint64_t)VM_KERNEL_ADDRPERM(n
));
1076 panic("sblastmbufchk from %s", where
);
/*
 * Similar to sbappend, except the mbuf chain begins a new record.
 */
1084 sbappendrecord_common(struct sockbuf
*sb
, struct mbuf
*m0
, boolean_t nodrop
)
1089 if (m0
== NULL
|| (sb
->sb_flags
& SB_DROP
)) {
1090 if (m0
!= NULL
&& nodrop
== FALSE
) {
1096 for (m
= m0
; m
!= NULL
; m
= m
->m_next
) {
1100 if (space
> sbspace(sb
) && !(sb
->sb_flags
& SB_UNIX
)) {
1101 if (nodrop
== FALSE
) {
1107 if (SOCK_DOM(sb
->sb_so
) == PF_INET
|| SOCK_DOM(sb
->sb_so
) == PF_INET6
) {
1108 ASSERT(nodrop
== FALSE
);
1109 if (sb
->sb_flags
& SB_RECV
&& !(m0
&& m0
->m_flags
& M_SKIPCFIL
)) {
1110 int error
= sflt_data_in(sb
->sb_so
, NULL
, &m0
, NULL
,
1111 sock_data_filt_flag_record
);
1115 error
= cfil_sock_data_in(sb
->sb_so
, NULL
, m0
, NULL
, 0);
1117 #endif /* CONTENT_FILTER */
1120 SBLASTRECORDCHK(sb
, "sbappendrecord 1");
1121 if (error
!= EJUSTRETURN
) {
1127 m0
->m_flags
&= ~M_SKIPCFIL
;
1132 * Note this permits zero length records.
1135 SBLASTRECORDCHK(sb
, "sbappendrecord 2");
1136 if (sb
->sb_lastrecord
!= NULL
) {
1137 sb
->sb_lastrecord
->m_nextpkt
= m0
;
1141 sb
->sb_lastrecord
= m0
;
1146 if (m
&& (m0
->m_flags
& M_EOR
)) {
1147 m0
->m_flags
&= ~M_EOR
;
1148 m
->m_flags
|= M_EOR
;
1150 sbcompress(sb
, m
, m0
);
1151 SBLASTRECORDCHK(sb
, "sbappendrecord 3");
1156 sbappendrecord(struct sockbuf
*sb
, struct mbuf
*m0
)
1158 return sbappendrecord_common(sb
, m0
, FALSE
);
1162 sbappendrecord_nodrop(struct sockbuf
*sb
, struct mbuf
*m0
)
1164 return sbappendrecord_common(sb
, m0
, TRUE
);
/*
 * Concatenate address (optional), control (optional) and data into one
 * single mbuf chain.  If sockbuf *sb is passed in, space check will be
 * performed.
 *
 * Returns:	mbuf chain pointer if succeeded, NULL if failed
 */
1175 sbconcat_mbufs(struct sockbuf
*sb
, struct sockaddr
*asa
, struct mbuf
*m0
, struct mbuf
*control
)
1177 struct mbuf
*m
= NULL
, *n
= NULL
;
1180 if (m0
&& (m0
->m_flags
& M_PKTHDR
) == 0) {
1181 panic("sbconcat_mbufs");
1185 space
+= m0
->m_pkthdr
.len
;
1187 for (n
= control
; n
; n
= n
->m_next
) {
1189 if (n
->m_next
== 0) { /* keep pointer to last control buf */
1195 if (asa
->sa_len
> MLEN
) {
1198 space
+= asa
->sa_len
;
1201 if (sb
!= NULL
&& space
> sbspace(sb
)) {
1206 n
->m_next
= m0
; /* concatenate data to control */
1212 MGET(m
, M_DONTWAIT
, MT_SONAME
);
1215 /* unchain control and data if necessary */
1220 m
->m_len
= asa
->sa_len
;
1221 bcopy((caddr_t
)asa
, mtod(m
, caddr_t
), asa
->sa_len
);
1223 m
->m_next
= control
;
1232 * Queue mbuf chain to the receive queue of a socket.
1233 * Parameter space is the total len of the mbuf chain.
1234 * If passed in, sockbuf space will be checked.
1236 * Returns: 0 Invalid mbuf chain
1240 sbappendchain(struct sockbuf
*sb
, struct mbuf
*m
, int space
)
1242 struct mbuf
*n
, *nlast
;
1248 if (space
!= 0 && space
> sbspace(sb
)) {
1252 for (n
= m
; n
->m_next
!= NULL
; n
= n
->m_next
) {
1258 if (sb
->sb_lastrecord
!= NULL
) {
1259 sb
->sb_lastrecord
->m_nextpkt
= m
;
1263 sb
->sb_lastrecord
= m
;
1264 sb
->sb_mbtail
= nlast
;
1266 SBLASTMBUFCHK(sb
, __func__
);
1267 SBLASTRECORDCHK(sb
, "sbappendadddr 2");
1272 * Returns: 0 Error: No space/out of mbufs/etc.
1275 * Imputed: (*error_out) errno for error
1277 * sflt_data_in:??? [whatever a filter author chooses]
1280 sbappendaddr(struct sockbuf
*sb
, struct sockaddr
*asa
, struct mbuf
*m0
,
1281 struct mbuf
*control
, int *error_out
)
1284 boolean_t sb_unix
= (sb
->sb_flags
& SB_UNIX
);
1285 struct mbuf
*mbuf_chain
= NULL
;
1291 if (m0
&& (m0
->m_flags
& M_PKTHDR
) == 0) {
1292 panic("sbappendaddrorfree");
1295 if (sb
->sb_flags
& SB_DROP
) {
1299 if (control
!= NULL
&& !sb_unix
) {
1302 if (error_out
!= NULL
) {
1303 *error_out
= EINVAL
;
1308 if (SOCK_DOM(sb
->sb_so
) == PF_INET
|| SOCK_DOM(sb
->sb_so
) == PF_INET6
) {
1309 /* Call socket data in filters */
1310 if (sb
->sb_flags
& SB_RECV
&& !(m0
&& m0
->m_flags
& M_SKIPCFIL
)) {
1312 error
= sflt_data_in(sb
->sb_so
, asa
, &m0
, &control
, 0);
1313 SBLASTRECORDCHK(sb
, __func__
);
1317 error
= cfil_sock_data_in(sb
->sb_so
, asa
, m0
, control
,
1320 #endif /* CONTENT_FILTER */
1323 if (error
!= EJUSTRETURN
) {
1327 if (control
!= NULL
&& !sb_unix
) {
1337 m0
->m_flags
&= ~M_SKIPCFIL
;
1341 mbuf_chain
= sbconcat_mbufs(sb
, asa
, m0
, control
);
1342 SBLASTRECORDCHK(sb
, "sbappendadddr 1");
1343 result
= sbappendchain(sb
, mbuf_chain
, 0);
1348 if (control
!= NULL
&& !sb_unix
) {
1352 *error_out
= ENOBUFS
;
1360 is_cmsg_valid(struct mbuf
*control
, struct cmsghdr
*cmsg
)
1366 if (cmsg
->cmsg_len
< sizeof(struct cmsghdr
)) {
1370 if ((uint8_t *)control
->m_data
>= (uint8_t *)cmsg
+ cmsg
->cmsg_len
) {
1374 if ((uint8_t *)control
->m_data
+ control
->m_len
<
1375 (uint8_t *)cmsg
+ cmsg
->cmsg_len
) {
1383 sbappendcontrol_internal(struct sockbuf
*sb
, struct mbuf
*m0
,
1384 struct mbuf
*control
)
1386 struct mbuf
*m
, *mlast
, *n
;
1390 panic("sbappendcontrol");
1393 for (m
= control
;; m
= m
->m_next
) {
1395 if (m
->m_next
== 0) {
1399 n
= m
; /* save pointer to last control buffer */
1400 for (m
= m0
; m
; m
= m
->m_next
) {
1403 if (space
> sbspace(sb
) && !(sb
->sb_flags
& SB_UNIX
)) {
1406 n
->m_next
= m0
; /* concatenate data to control */
1407 SBLASTRECORDCHK(sb
, "sbappendcontrol 1");
1409 for (m
= control
; m
->m_next
!= NULL
; m
= m
->m_next
) {
1415 if (sb
->sb_lastrecord
!= NULL
) {
1416 sb
->sb_lastrecord
->m_nextpkt
= control
;
1418 sb
->sb_mb
= control
;
1420 sb
->sb_lastrecord
= control
;
1421 sb
->sb_mbtail
= mlast
;
1423 SBLASTMBUFCHK(sb
, __func__
);
1424 SBLASTRECORDCHK(sb
, "sbappendcontrol 2");
1429 sbappendcontrol(struct sockbuf
*sb
, struct mbuf
*m0
, struct mbuf
*control
,
1433 boolean_t sb_unix
= (sb
->sb_flags
& SB_UNIX
);
1439 if (sb
->sb_flags
& SB_DROP
) {
1443 if (control
!= NULL
&& !sb_unix
) {
1446 if (error_out
!= NULL
) {
1447 *error_out
= EINVAL
;
1452 if (SOCK_DOM(sb
->sb_so
) == PF_INET
|| SOCK_DOM(sb
->sb_so
) == PF_INET6
) {
1453 if (sb
->sb_flags
& SB_RECV
&& !(m0
&& m0
->m_flags
& M_SKIPCFIL
)) {
1456 error
= sflt_data_in(sb
->sb_so
, NULL
, &m0
, &control
, 0);
1457 SBLASTRECORDCHK(sb
, __func__
);
1461 error
= cfil_sock_data_in(sb
->sb_so
, NULL
, m0
, control
,
1464 #endif /* CONTENT_FILTER */
1467 if (error
!= EJUSTRETURN
) {
1471 if (control
!= NULL
&& !sb_unix
) {
1481 m0
->m_flags
&= ~M_SKIPCFIL
;
1485 result
= sbappendcontrol_internal(sb
, m0
, control
);
1490 if (control
!= NULL
&& !sb_unix
) {
1494 *error_out
= ENOBUFS
;
1502 * TCP streams have Multipath TCP support or are regular TCP sockets.
1505 sbappendstream_rcvdemux(struct socket
*so
, struct mbuf
*m
)
1511 !((so
->so_flags
& SOF_MP_SUBFLOW
) &&
1512 (m
->m_flags
& M_PKTHDR
) &&
1513 (m
->m_pkthdr
.pkt_flags
& PKTF_MPTCP_DFIN
))) {
1519 if (so
->so_flags
& SOF_MP_SUBFLOW
) {
1520 return sbappendmptcpstream_rcv(&so
->so_rcv
, m
);
1524 return sbappendstream(&so
->so_rcv
, m
);
1530 sbappendmptcpstream_rcv(struct sockbuf
*sb
, struct mbuf
*m
)
1532 struct socket
*so
= sb
->sb_so
;
1534 VERIFY(m
== NULL
|| (m
->m_flags
& M_PKTHDR
));
1535 /* SB_NOCOMPRESS must be set prevent loss of M_PKTHDR data */
1536 VERIFY((sb
->sb_flags
& (SB_RECV
| SB_NOCOMPRESS
)) ==
1537 (SB_RECV
| SB_NOCOMPRESS
));
1539 if (m
== NULL
|| m_pktlen(m
) == 0 || (sb
->sb_flags
& SB_DROP
) ||
1540 (so
->so_state
& SS_CANTRCVMORE
)) {
1541 if (m
&& (m
->m_flags
& M_PKTHDR
) &&
1543 (m
->m_pkthdr
.pkt_flags
& PKTF_MPTCP_DFIN
)) {
1544 mptcp_input(tptomptp(sototcpcb(so
))->mpt_mpte
, m
);
1546 } else if (m
!= NULL
) {
1551 /* the socket is not closed, so SOF_MP_SUBFLOW must be set */
1552 VERIFY(so
->so_flags
& SOF_MP_SUBFLOW
);
1554 if (m
->m_nextpkt
!= NULL
|| (sb
->sb_mb
!= sb
->sb_lastrecord
)) {
1555 panic("%s: nexpkt %p || mb %p != lastrecord %p\n", __func__
,
1556 m
->m_nextpkt
, sb
->sb_mb
, sb
->sb_lastrecord
);
1560 SBLASTMBUFCHK(sb
, __func__
);
1562 /* No filter support (SB_RECV) on mptcp subflow sockets */
1564 sbcompress(sb
, m
, sb
->sb_mbtail
);
1565 sb
->sb_lastrecord
= sb
->sb_mb
;
1566 SBLASTRECORDCHK(sb
, __func__
);
/*
 * Compress mbuf chain m into the socket
 * buffer sb following mbuf n.  If n
 * is null, the buffer is presumed empty.
 */
1577 sbcompress(struct sockbuf
*sb
, struct mbuf
*m
, struct mbuf
*n
)
1579 int eor
= 0, compress
= (!(sb
->sb_flags
& SB_NOCOMPRESS
));
1583 /* There is nothing to compress; just update the tail */
1584 for (; n
->m_next
!= NULL
; n
= n
->m_next
) {
1592 eor
|= m
->m_flags
& M_EOR
;
1593 if (compress
&& m
->m_len
== 0 && (eor
== 0 ||
1594 (((o
= m
->m_next
) || (o
= n
)) && o
->m_type
== m
->m_type
))) {
1595 if (sb
->sb_lastrecord
== m
) {
1596 sb
->sb_lastrecord
= m
->m_next
;
1601 if (compress
&& n
!= NULL
&& (n
->m_flags
& M_EOR
) == 0 &&
1605 m
->m_len
<= MCLBYTES
/ 4 && /* XXX: Don't copy too much */
1606 m
->m_len
<= M_TRAILINGSPACE(n
) &&
1607 n
->m_type
== m
->m_type
) {
1608 bcopy(mtod(m
, caddr_t
), mtod(n
, caddr_t
) + n
->m_len
,
1609 (unsigned)m
->m_len
);
1610 n
->m_len
+= m
->m_len
;
1611 sb
->sb_cc
+= m
->m_len
;
1612 if (m
->m_type
!= MT_DATA
&& m
->m_type
!= MT_HEADER
&&
1613 m
->m_type
!= MT_OOBDATA
) {
1614 /* XXX: Probably don't need */
1615 sb
->sb_ctl
+= m
->m_len
;
1618 /* update send byte count */
1619 if (sb
->sb_flags
& SB_SNDBYTE_CNT
) {
1620 inp_incr_sndbytes_total(sb
->sb_so
,
1622 inp_incr_sndbytes_unsent(sb
->sb_so
,
1636 m
->m_flags
&= ~M_EOR
;
1644 printf("semi-panic: sbcompress\n");
1648 SBLASTMBUFCHK(sb
, __func__
);
1652 sb_empty_assert(struct sockbuf
*sb
, const char *where
)
1654 if (!(sb
->sb_cc
== 0 && sb
->sb_mb
== NULL
&& sb
->sb_mbcnt
== 0 &&
1655 sb
->sb_mbtail
== NULL
&& sb
->sb_lastrecord
== NULL
)) {
1656 panic("%s: sb %p so %p cc %d mbcnt %d mb %p mbtail %p "
1657 "lastrecord %p\n", where
, sb
, sb
->sb_so
, sb
->sb_cc
,
1658 sb
->sb_mbcnt
, sb
->sb_mb
, sb
->sb_mbtail
,
/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
1669 sbflush(struct sockbuf
*sb
)
1671 void *lr_saved
= __builtin_return_address(0);
1672 struct socket
*so
= sb
->sb_so
;
1674 /* so_usecount may be 0 if we get here from sofreelastref() */
1676 panic("%s: null so, sb=%p sb_flags=0x%x lr=%p\n",
1677 __func__
, sb
, sb
->sb_flags
, lr_saved
);
1679 } else if (so
->so_usecount
< 0) {
1680 panic("%s: sb=%p sb_flags=0x%x sb_so=%p usecount=%d lr=%p "
1681 "lrh= %s\n", __func__
, sb
, sb
->sb_flags
, so
,
1682 so
->so_usecount
, lr_saved
, solockhistory_nr(so
));
1687 * Obtain lock on the socket buffer (SB_LOCK). This is required
1688 * to prevent the socket buffer from being unexpectedly altered
1689 * while it is used by another thread in socket send/receive.
1691 * sblock() must not fail here, hence the assertion.
1693 (void) sblock(sb
, SBL_WAIT
| SBL_NOINTR
| SBL_IGNDEFUNCT
);
1694 VERIFY(sb
->sb_flags
& SB_LOCK
);
1696 while (sb
->sb_mbcnt
> 0) {
1698 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
1699 * we would loop forever. Panic instead.
1701 if (!sb
->sb_cc
&& (sb
->sb_mb
== NULL
|| sb
->sb_mb
->m_len
)) {
1704 sbdrop(sb
, (int)sb
->sb_cc
);
1707 sb_empty_assert(sb
, __func__
);
1708 sbunlock(sb
, TRUE
); /* keep socket locked */
/*
 * Drop data from (the front of) a sockbuf.
 * use m_freem_list to free the mbuf structures
 * under a single lock... this is done by pruning
 * the top of the tree from the body by keeping track
 * of where we get to in the tree and then zeroing the
 * two pertinent pointers m_nextpkt and m_next
 * the socket buffer is then updated to point at the new
 * top of the tree and the pruned area is released via
 * m_freem_list.
 */
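/*
 * Illustrative sketch (not part of this file's build): after a datagram
 * record has been copied out to the user, the receive path discards it as a
 * unit rather than byte-by-byte (socket buffer assumed locked):
 *
 *	sbdroprecord(&so->so_rcv);      // unlink and free the whole record
 *
 * Stream protocols instead call sbdrop(sb, len) with the exact number of
 * bytes consumed or acknowledged.
 */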
1723 sbdrop(struct sockbuf
*sb
, int len
)
1725 struct mbuf
*m
, *free_list
, *ml
;
1726 struct mbuf
*next
, *last
;
1728 next
= (m
= sb
->sb_mb
) ? m
->m_nextpkt
: 0;
1730 if (m
!= NULL
&& len
> 0 && !(sb
->sb_flags
& SB_RECV
) &&
1731 ((sb
->sb_so
->so_flags
& SOF_MP_SUBFLOW
) ||
1732 (SOCK_CHECK_DOM(sb
->sb_so
, PF_MULTIPATH
) &&
1733 SOCK_CHECK_PROTO(sb
->sb_so
, IPPROTO_TCP
))) &&
1734 !(sb
->sb_so
->so_flags1
& SOF1_POST_FALLBACK_SYNC
)) {
1735 mptcp_preproc_sbdrop(sb
->sb_so
, m
, (unsigned int)len
);
1737 if (m
!= NULL
&& len
> 0 && !(sb
->sb_flags
& SB_RECV
) &&
1738 (sb
->sb_so
->so_flags
& SOF_MP_SUBFLOW
) &&
1739 (sb
->sb_so
->so_flags1
& SOF1_POST_FALLBACK_SYNC
)) {
1740 mptcp_fallback_sbdrop(sb
->sb_so
, m
, len
);
1743 KERNEL_DEBUG((DBG_FNC_SBDROP
| DBG_FUNC_START
), sb
, len
, 0, 0, 0);
1745 free_list
= last
= m
;
1746 ml
= (struct mbuf
*)0;
1752 * temporarily replacing this panic with printf
1753 * because it occurs occasionally when closing
1754 * a socket when there is no harm in ignoring
1755 * it. This problem will be investigated
1758 /* panic("sbdrop"); */
1759 printf("sbdrop - count not zero\n");
1762 * zero the counts. if we have no mbufs,
1763 * we have no data (PR-2986815)
1770 next
= m
->m_nextpkt
;
1773 if (m
->m_len
> len
) {
1777 /* update the send byte count */
1778 if (sb
->sb_flags
& SB_SNDBYTE_CNT
) {
1779 inp_decr_sndbytes_total(sb
->sb_so
, len
);
1781 if (m
->m_type
!= MT_DATA
&& m
->m_type
!= MT_HEADER
&&
1782 m
->m_type
!= MT_OOBDATA
) {
1793 while (m
&& m
->m_len
== 0) {
1800 ml
->m_next
= (struct mbuf
*)0;
1801 last
->m_nextpkt
= (struct mbuf
*)0;
1802 m_freem_list(free_list
);
1806 m
->m_nextpkt
= next
;
1812 * First part is an inline SB_EMPTY_FIXUP(). Second part
1813 * makes sure sb_lastrecord is up-to-date if we dropped
1814 * part of the last record.
1818 sb
->sb_mbtail
= NULL
;
1819 sb
->sb_lastrecord
= NULL
;
1820 } else if (m
->m_nextpkt
== NULL
) {
1821 sb
->sb_lastrecord
= m
;
1825 cfil_sock_buf_update(sb
);
1826 #endif /* CONTENT_FILTER */
1828 KERNEL_DEBUG((DBG_FNC_SBDROP
| DBG_FUNC_END
), sb
, 0, 0, 0, 0);
/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
1836 sbdroprecord(struct sockbuf
*sb
)
1838 struct mbuf
*m
, *mn
;
1842 sb
->sb_mb
= m
->m_nextpkt
;
/*
 * Create a "control" mbuf containing the specified data
 * with the specified type for presentation on a socket buffer.
 */
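/*
 * Illustrative sketch (not part of this file's build): a protocol delivering
 * ancillary data builds the control record before appending it to the
 * receive buffer.  Here `ttl', `from' and `data' stand in for the protocol's
 * own values (socket buffer assumed locked):
 *
 *	u_char ttl = ip->ip_ttl;
 *	struct mbuf *ctl = sbcreatecontrol((caddr_t)&ttl, sizeof(ttl),
 *	    IP_RECVTTL, IPPROTO_IP);
 *	if (ctl != NULL)
 *		sbappendaddr(&so->so_rcv, from, data, ctl, NULL);
 */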
struct mbuf *
sbcreatecontrol(caddr_t p, int size, int type, int level)
{
	struct cmsghdr *cp;
	struct mbuf *m;

	if (CMSG_SPACE((u_int)size) > MLEN) {
		return (struct mbuf *)NULL;
	}
	if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL) {
		return (struct mbuf *)NULL;
	}
	cp = mtod(m, struct cmsghdr *);
	VERIFY(IS_P2ALIGNED(cp, sizeof(u_int32_t)));
	/* XXX check size? */
	(void) memcpy(CMSG_DATA(cp), p, size);
	m->m_len = (int32_t)CMSG_SPACE(size);
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	return m;
}
1880 sbcreatecontrol_mbuf(caddr_t p
, int size
, int type
, int level
, struct mbuf
**mp
)
1886 *mp
= sbcreatecontrol(p
, size
, type
, level
);
1890 if (CMSG_SPACE((u_int
)size
) + (*mp
)->m_len
> MLEN
) {
1891 mp
= &(*mp
)->m_next
;
1892 *mp
= sbcreatecontrol(p
, size
, type
, level
);
1898 cp
= (struct cmsghdr
*)(void *)(mtod(m
, char *) + m
->m_len
);
1899 /* CMSG_SPACE ensures 32-bit alignment */
1900 VERIFY(IS_P2ALIGNED(cp
, sizeof(u_int32_t
)));
1901 m
->m_len
+= (int32_t)CMSG_SPACE(size
);
1903 /* XXX check size? */
1904 (void) memcpy(CMSG_DATA(cp
), p
, size
);
1905 cp
->cmsg_len
= CMSG_LEN(size
);
1906 cp
->cmsg_level
= level
;
1907 cp
->cmsg_type
= type
;
/*
 * Some routines that return EOPNOTSUPP for entry points that are not
 * supported by a protocol.  Fill in as needed.
 */
1918 pru_abort_notsupp(struct socket
*so
)
1925 pru_accept_notsupp(struct socket
*so
, struct sockaddr
**nam
)
1927 #pragma unused(so, nam)
1932 pru_attach_notsupp(struct socket
*so
, int proto
, struct proc
*p
)
1934 #pragma unused(so, proto, p)
1939 pru_bind_notsupp(struct socket
*so
, struct sockaddr
*nam
, struct proc
*p
)
1941 #pragma unused(so, nam, p)
1946 pru_connect_notsupp(struct socket
*so
, struct sockaddr
*nam
, struct proc
*p
)
1948 #pragma unused(so, nam, p)
1953 pru_connect2_notsupp(struct socket
*so1
, struct socket
*so2
)
1955 #pragma unused(so1, so2)
1960 pru_connectx_notsupp(struct socket
*so
, struct sockaddr
*src
,
1961 struct sockaddr
*dst
, struct proc
*p
, uint32_t ifscope
,
1962 sae_associd_t aid
, sae_connid_t
*pcid
, uint32_t flags
, void *arg
,
1963 uint32_t arglen
, struct uio
*uio
, user_ssize_t
*bytes_written
)
1965 #pragma unused(so, src, dst, p, ifscope, aid, pcid, flags, arg, arglen, uio, bytes_written)
1970 pru_control_notsupp(struct socket
*so
, u_long cmd
, caddr_t data
,
1971 struct ifnet
*ifp
, struct proc
*p
)
1973 #pragma unused(so, cmd, data, ifp, p)
1978 pru_detach_notsupp(struct socket
*so
)
1985 pru_disconnect_notsupp(struct socket
*so
)
1992 pru_disconnectx_notsupp(struct socket
*so
, sae_associd_t aid
, sae_connid_t cid
)
1994 #pragma unused(so, aid, cid)
1999 pru_listen_notsupp(struct socket
*so
, struct proc
*p
)
2001 #pragma unused(so, p)
2006 pru_peeraddr_notsupp(struct socket
*so
, struct sockaddr
**nam
)
2008 #pragma unused(so, nam)
2013 pru_rcvd_notsupp(struct socket
*so
, int flags
)
2015 #pragma unused(so, flags)
2020 pru_rcvoob_notsupp(struct socket
*so
, struct mbuf
*m
, int flags
)
2022 #pragma unused(so, m, flags)
2027 pru_send_notsupp(struct socket
*so
, int flags
, struct mbuf
*m
,
2028 struct sockaddr
*addr
, struct mbuf
*control
, struct proc
*p
)
2030 #pragma unused(so, flags, m, addr, control, p)
2035 pru_send_list_notsupp(struct socket
*so
, int flags
, struct mbuf
*m
,
2036 struct sockaddr
*addr
, struct mbuf
*control
, struct proc
*p
)
2038 #pragma unused(so, flags, m, addr, control, p)
/*
 * This isn't really a ``null'' operation, but it's the default one
 * and doesn't do anything destructive.
 */
2047 pru_sense_null(struct socket
*so
, void *ub
, int isstat64
)
2049 if (isstat64
!= 0) {
2050 struct stat64
*sb64
;
2052 sb64
= (struct stat64
*)ub
;
2053 sb64
->st_blksize
= so
->so_snd
.sb_hiwat
;
2057 sb
= (struct stat
*)ub
;
2058 sb
->st_blksize
= so
->so_snd
.sb_hiwat
;
2066 pru_sosend_notsupp(struct socket
*so
, struct sockaddr
*addr
, struct uio
*uio
,
2067 struct mbuf
*top
, struct mbuf
*control
, int flags
)
2069 #pragma unused(so, addr, uio, top, control, flags)
2074 pru_sosend_list_notsupp(struct socket
*so
, struct uio
**uio
,
2075 u_int uiocnt
, int flags
)
2077 #pragma unused(so, uio, uiocnt, flags)
2082 pru_soreceive_notsupp(struct socket
*so
, struct sockaddr
**paddr
,
2083 struct uio
*uio
, struct mbuf
**mp0
, struct mbuf
**controlp
, int *flagsp
)
2085 #pragma unused(so, paddr, uio, mp0, controlp, flagsp)
2090 pru_soreceive_list_notsupp(struct socket
*so
,
2091 struct recv_msg_elem
*recv_msg_array
, u_int uiocnt
, int *flagsp
)
2093 #pragma unused(so, recv_msg_array, uiocnt, flagsp)
2098 pru_shutdown_notsupp(struct socket
*so
)
2105 pru_sockaddr_notsupp(struct socket
*so
, struct sockaddr
**nam
)
2107 #pragma unused(so, nam)
2112 pru_sopoll_notsupp(struct socket
*so
, int events
, kauth_cred_t cred
, void *wql
)
2114 #pragma unused(so, events, cred, wql)
2119 pru_socheckopt_null(struct socket
*so
, struct sockopt
*sopt
)
2121 #pragma unused(so, sopt)
2123 * Allow all options for set/get by default.
2129 pru_preconnect_null(struct socket
*so
)
2136 pru_sanitize(struct pr_usrreqs
*pru
)
2138 #define DEFAULT(foo, bar) if ((foo) == NULL) (foo) = (bar)
2139 DEFAULT(pru
->pru_abort
, pru_abort_notsupp
);
2140 DEFAULT(pru
->pru_accept
, pru_accept_notsupp
);
2141 DEFAULT(pru
->pru_attach
, pru_attach_notsupp
);
2142 DEFAULT(pru
->pru_bind
, pru_bind_notsupp
);
2143 DEFAULT(pru
->pru_connect
, pru_connect_notsupp
);
2144 DEFAULT(pru
->pru_connect2
, pru_connect2_notsupp
);
2145 DEFAULT(pru
->pru_connectx
, pru_connectx_notsupp
);
2146 DEFAULT(pru
->pru_control
, pru_control_notsupp
);
2147 DEFAULT(pru
->pru_detach
, pru_detach_notsupp
);
2148 DEFAULT(pru
->pru_disconnect
, pru_disconnect_notsupp
);
2149 DEFAULT(pru
->pru_disconnectx
, pru_disconnectx_notsupp
);
2150 DEFAULT(pru
->pru_listen
, pru_listen_notsupp
);
2151 DEFAULT(pru
->pru_peeraddr
, pru_peeraddr_notsupp
);
2152 DEFAULT(pru
->pru_rcvd
, pru_rcvd_notsupp
);
2153 DEFAULT(pru
->pru_rcvoob
, pru_rcvoob_notsupp
);
2154 DEFAULT(pru
->pru_send
, pru_send_notsupp
);
2155 DEFAULT(pru
->pru_send_list
, pru_send_list_notsupp
);
2156 DEFAULT(pru
->pru_sense
, pru_sense_null
);
2157 DEFAULT(pru
->pru_shutdown
, pru_shutdown_notsupp
);
2158 DEFAULT(pru
->pru_sockaddr
, pru_sockaddr_notsupp
);
2159 DEFAULT(pru
->pru_sopoll
, pru_sopoll_notsupp
);
2160 DEFAULT(pru
->pru_soreceive
, pru_soreceive_notsupp
);
2161 DEFAULT(pru
->pru_soreceive_list
, pru_soreceive_list_notsupp
);
2162 DEFAULT(pru
->pru_sosend
, pru_sosend_notsupp
);
2163 DEFAULT(pru
->pru_sosend_list
, pru_sosend_list_notsupp
);
2164 DEFAULT(pru
->pru_socheckopt
, pru_socheckopt_null
);
2165 DEFAULT(pru
->pru_preconnect
, pru_preconnect_null
);
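/*
 * Illustrative sketch (not part of this file's build): a protocol fills in
 * only the entry points it implements and lets pru_sanitize() plug the rest
 * with the *_notsupp/_null defaults above.  The names below are hypothetical.
 *
 *	static struct pr_usrreqs myproto_usrreqs = {
 *		.pru_attach    = myproto_attach,
 *		.pru_detach    = myproto_detach,
 *		.pru_send      = myproto_send,
 *		.pru_soreceive = soreceive,
 *	};
 *	...
 *	pru_sanitize(&myproto_usrreqs);   // done once at protocol init
 */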
/*
 * The following are macros on BSD and functions on Darwin
 */

/*
 * Do we need to notify the other side when I/O is possible?
 */
int
sb_notify(struct sockbuf *sb)
{
	return sb->sb_waiters > 0 ||
	       (sb->sb_flags & (SB_SEL | SB_ASYNC | SB_UPCALL | SB_KNOTE));
}
/*
 * How much space is there in a socket buffer (so->so_snd or so->so_rcv)?
 * This is problematical if the fields are unsigned, as the space might
 * still be negative (cc > hiwat or mbcnt > mbmax).  Should detect
 * overflow and return 0.
 */
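/*
 * Illustrative arithmetic (not part of this file's build): the usable space
 * is the smaller of the byte headroom and the mbuf-accounting headroom,
 *
 *	space = min(sb_hiwat - sb_cc, sb_mbmax - sb_mbcnt)
 *
 * e.g. sb_hiwat of 64 KB with 48 KB queued gives 16 KB of byte headroom, but
 * if sb_mbmax - sb_mbcnt is only 8 KB the caller may append at most 8 KB.
 * A preconnect high-water mark and data parked in content filters further
 * reduce the result, and it is clamped to be non-negative.
 */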
2191 sbspace(struct sockbuf
*sb
)
2194 int space
= imin((int)(sb
->sb_hiwat
- sb
->sb_cc
),
2195 (int)(sb
->sb_mbmax
- sb
->sb_mbcnt
));
2197 if (sb
->sb_preconn_hiwat
!= 0) {
2198 space
= imin((int)(sb
->sb_preconn_hiwat
- sb
->sb_cc
), space
);
2205 /* Compensate for data being processed by content filters */
2207 pending
= cfil_sock_data_space(sb
);
2208 #endif /* CONTENT_FILTER */
2209 if (pending
> space
) {
2218 /* do we have to send all at once on a socket? */
2220 sosendallatonce(struct socket
*so
)
2222 return so
->so_proto
->pr_flags
& PR_ATOMIC
;
2225 /* can we read something from so? */
2227 soreadable(struct socket
*so
)
2229 return so
->so_rcv
.sb_cc
>= so
->so_rcv
.sb_lowat
||
2230 ((so
->so_state
& SS_CANTRCVMORE
)
2232 && cfil_sock_data_pending(&so
->so_rcv
) == 0
2233 #endif /* CONTENT_FILTER */
2235 so
->so_comp
.tqh_first
|| so
->so_error
;
2238 /* can we write something to so? */
2241 sowriteable(struct socket
*so
)
2243 if ((so
->so_state
& SS_CANTSENDMORE
) ||
2247 if (so_wait_for_if_feedback(so
) || !socanwrite(so
)) {
2250 if (so
->so_flags1
& SOF1_PRECONNECT_DATA
) {
2254 if (sbspace(&(so
)->so_snd
) >= (so
)->so_snd
.sb_lowat
) {
2255 if (so
->so_flags
& SOF_NOTSENT_LOWAT
) {
2256 if ((SOCK_DOM(so
) == PF_INET6
||
2257 SOCK_DOM(so
) == PF_INET
) &&
2258 so
->so_type
== SOCK_STREAM
) {
2259 return tcp_notsent_lowat_check(so
);
2262 else if ((SOCK_DOM(so
) == PF_MULTIPATH
) &&
2263 (SOCK_PROTO(so
) == IPPROTO_TCP
)) {
2264 return mptcp_notsent_lowat_check(so
);
2277 /* adjust counters in sb reflecting allocation of m */
2280 sballoc(struct sockbuf
*sb
, struct mbuf
*m
)
2283 sb
->sb_cc
+= m
->m_len
;
2284 if (m
->m_type
!= MT_DATA
&& m
->m_type
!= MT_HEADER
&&
2285 m
->m_type
!= MT_OOBDATA
) {
2286 sb
->sb_ctl
+= m
->m_len
;
2288 sb
->sb_mbcnt
+= MSIZE
;
2290 if (m
->m_flags
& M_EXT
) {
2291 sb
->sb_mbcnt
+= m
->m_ext
.ext_size
;
2292 cnt
+= (m
->m_ext
.ext_size
>> MSIZESHIFT
);
2294 OSAddAtomic(cnt
, &total_sbmb_cnt
);
2295 VERIFY(total_sbmb_cnt
> 0);
2296 if (total_sbmb_cnt
> total_sbmb_cnt_peak
) {
2297 total_sbmb_cnt_peak
= total_sbmb_cnt
;
2301 * If data is being added to the send socket buffer,
2302 * update the send byte count
2304 if (sb
->sb_flags
& SB_SNDBYTE_CNT
) {
2305 inp_incr_sndbytes_total(sb
->sb_so
, m
->m_len
);
2306 inp_incr_sndbytes_unsent(sb
->sb_so
, m
->m_len
);
2310 /* adjust counters in sb reflecting freeing of m */
2312 sbfree(struct sockbuf
*sb
, struct mbuf
*m
)
2316 sb
->sb_cc
-= m
->m_len
;
2317 if (m
->m_type
!= MT_DATA
&& m
->m_type
!= MT_HEADER
&&
2318 m
->m_type
!= MT_OOBDATA
) {
2319 sb
->sb_ctl
-= m
->m_len
;
2321 sb
->sb_mbcnt
-= MSIZE
;
2322 if (m
->m_flags
& M_EXT
) {
2323 sb
->sb_mbcnt
-= m
->m_ext
.ext_size
;
2324 cnt
-= (m
->m_ext
.ext_size
>> MSIZESHIFT
);
2326 OSAddAtomic(cnt
, &total_sbmb_cnt
);
2327 VERIFY(total_sbmb_cnt
>= 0);
2328 if (total_sbmb_cnt
< total_sbmb_cnt_floor
) {
2329 total_sbmb_cnt_floor
= total_sbmb_cnt
;
2333 * If data is being removed from the send socket buffer,
2334 * update the send byte count
2336 if (sb
->sb_flags
& SB_SNDBYTE_CNT
) {
2337 inp_decr_sndbytes_total(sb
->sb_so
, m
->m_len
);
/*
 * Set lock on sockbuf sb; sleep if lock is already held.
 * Unless SB_NOINTR is set on sockbuf, sleep is interruptible.
 * Returns error without lock if sleep is interrupted.
 */
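/*
 * Illustrative sketch (not part of this file's build): send/receive paths
 * hold the sockbuf lock across the whole operation so the buffer cannot
 * change underneath them (socket lock assumed held on entry):
 *
 *	int error = sblock(&so->so_snd, SBL_WAIT);
 *	if (error != 0)
 *		return error;               // interrupted or defunct
 *	... append data, possibly sleeping in sbwait() ...
 *	sbunlock(&so->so_snd, FALSE);       // FALSE: also drop the socket lock
 */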
2347 sblock(struct sockbuf
*sb
, uint32_t flags
)
2349 boolean_t nointr
= ((sb
->sb_flags
& SB_NOINTR
) || (flags
& SBL_NOINTR
));
2350 void *lr_saved
= __builtin_return_address(0);
2351 struct socket
*so
= sb
->sb_so
;
2354 thread_t tp
= current_thread();
2356 VERIFY((flags
& SBL_VALID
) == flags
);
2358 /* so_usecount may be 0 if we get here from sofreelastref() */
2360 panic("%s: null so, sb=%p sb_flags=0x%x lr=%p\n",
2361 __func__
, sb
, sb
->sb_flags
, lr_saved
);
2363 } else if (so
->so_usecount
< 0) {
2364 panic("%s: sb=%p sb_flags=0x%x sb_so=%p usecount=%d lr=%p "
2365 "lrh= %s\n", __func__
, sb
, sb
->sb_flags
, so
,
2366 so
->so_usecount
, lr_saved
, solockhistory_nr(so
));
2371 * The content filter thread must hold the sockbuf lock
2373 if ((so
->so_flags
& SOF_CONTENT_FILTER
) && sb
->sb_cfil_thread
== tp
) {
2375 * Don't panic if we are defunct because SB_LOCK has
2376 * been cleared by sodefunct()
2378 if (!(so
->so_flags
& SOF_DEFUNCT
) && !(sb
->sb_flags
& SB_LOCK
)) {
2379 panic("%s: SB_LOCK not held for %p\n",
2383 /* Keep the sockbuf locked */
2387 if ((sb
->sb_flags
& SB_LOCK
) && !(flags
& SBL_WAIT
)) {
2391 * We may get here from sorflush(), in which case "sb" may not
2392 * point to the real socket buffer. Use the actual socket buffer
2393 * address from the socket instead.
2395 wchan
= (sb
->sb_flags
& SB_RECV
) ?
2396 &so
->so_rcv
.sb_flags
: &so
->so_snd
.sb_flags
;
2399 * A content filter thread has exclusive access to the sockbuf
2400 * until it clears the
2402 while ((sb
->sb_flags
& SB_LOCK
) ||
2403 ((so
->so_flags
& SOF_CONTENT_FILTER
) &&
2404 sb
->sb_cfil_thread
!= NULL
)) {
2405 lck_mtx_t
*mutex_held
;
2408 * XXX: This code should be moved up above outside of this loop;
2409 * however, we may get here as part of sofreelastref(), and
2410 * at that time pr_getlock() may no longer be able to return
2411 * us the lock. This will be fixed in future.
2413 if (so
->so_proto
->pr_getlock
!= NULL
) {
2414 mutex_held
= (*so
->so_proto
->pr_getlock
)(so
, PR_F_WILLUNLOCK
);
2416 mutex_held
= so
->so_proto
->pr_domain
->dom_mtx
;
2419 LCK_MTX_ASSERT(mutex_held
, LCK_MTX_ASSERT_OWNED
);
2422 VERIFY(sb
->sb_wantlock
!= 0);
2424 error
= msleep(wchan
, mutex_held
,
2425 nointr
? PSOCK
: PSOCK
| PCATCH
,
2426 nointr
? "sb_lock_nointr" : "sb_lock", NULL
);
2428 VERIFY(sb
->sb_wantlock
!= 0);
2431 if (error
== 0 && (so
->so_flags
& SOF_DEFUNCT
) &&
2432 !(flags
& SBL_IGNDEFUNCT
)) {
2434 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
2435 "(%d)\n", __func__
, proc_selfpid(),
2436 proc_best_name(current_proc()),
2437 (uint64_t)VM_KERNEL_ADDRPERM(so
),
2438 SOCK_DOM(so
), SOCK_TYPE(so
), error
);
2445 sb
->sb_flags
|= SB_LOCK
;
/*
 * Release lock on sockbuf sb
 */
void
sbunlock(struct sockbuf *sb, boolean_t keeplocked)
{
	void *lr_saved = __builtin_return_address(0);
	struct socket *so = sb->sb_so;
	thread_t tp = current_thread();

	/* so_usecount may be 0 if we get here from sofreelastref() */
	if (so == NULL) {
		panic("%s: null so, sb=%p sb_flags=0x%x lr=%p\n",
		    __func__, sb, sb->sb_flags, lr_saved);
	} else if (so->so_usecount < 0) {
		panic("%s: sb=%p sb_flags=0x%x sb_so=%p usecount=%d lr=%p "
		    "lrh= %s\n", __func__, sb, sb->sb_flags, so,
		    so->so_usecount, lr_saved, solockhistory_nr(so));
	}

	/*
	 * The content filter thread must hold the sockbuf lock
	 */
	if ((so->so_flags & SOF_CONTENT_FILTER) && sb->sb_cfil_thread == tp) {
		/*
		 * Don't panic if we are defunct because SB_LOCK has
		 * been cleared by sodefunct()
		 */
		if (!(so->so_flags & SOF_DEFUNCT) &&
		    !(sb->sb_flags & SB_LOCK) &&
		    !(so->so_state & SS_DEFUNCT) &&
		    !(so->so_flags1 & SOF1_DEFUNCTINPROG)) {
			panic("%s: SB_LOCK not held for %p\n",
			    __func__, sb);
		}
		/* Keep the sockbuf locked and proceed */
	} else {
		VERIFY((sb->sb_flags & SB_LOCK) ||
		    (so->so_state & SS_DEFUNCT) ||
		    (so->so_flags1 & SOF1_DEFUNCTINPROG));

		sb->sb_flags &= ~SB_LOCK;

		if (sb->sb_wantlock > 0) {
			/*
			 * We may get here from sorflush(), in which case "sb"
			 * may not point to the real socket buffer.  Use the
			 * actual socket buffer address from the socket instead.
			 */
			wakeup((sb->sb_flags & SB_RECV) ? &so->so_rcv.sb_flags :
			    &so->so_snd.sb_flags);
		}
	}

	if (!keeplocked) {      /* unlock on exit */
		if (so->so_flags & SOF_MP_SUBFLOW || SOCK_DOM(so) == PF_MULTIPATH) {
			(*so->so_proto->pr_unlock)(so, 1, lr_saved);
		} else {
			lck_mtx_t *mutex_held;

			if (so->so_proto->pr_getlock != NULL) {
				mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
			} else {
				mutex_held = so->so_proto->pr_domain->dom_mtx;
			}

			LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

			VERIFY(so->so_usecount > 0);
			so->so_usecount--;
			so->unlock_lr[so->next_unlock_lr] = lr_saved;
			so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
			lck_mtx_unlock(mutex_held);
		}
	}
}
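/*
 * Illustrative sketch (not part of the original file): a typical caller
 * takes the socket lock first, serializes against other sockbuf users with
 * sblock(), and releases with sbunlock() passing TRUE so the socket lock
 * itself stays held.  The helper name and the use of sbappendstream() here
 * are assumptions for illustration only.
 */
#if 0
static int
example_append_locked(struct socket *so, struct mbuf *m)
{
	int error;

	socket_lock(so, 1);                     /* take the socket lock */
	error = sblock(&so->so_snd, SBL_WAIT);  /* may sleep; can be interrupted */
	if (error == 0) {
		sbappendstream(&so->so_snd, m); /* append while holding SB_LOCK */
		sbunlock(&so->so_snd, TRUE);    /* drop SB_LOCK, keep socket lock */
	}
	socket_unlock(so, 1);
	return error;
}
#endif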
void
sorwakeup(struct socket *so)
{
	if (sb_notify(&so->so_rcv)) {
		sowakeup(so, &so->so_rcv, NULL);
	}
}

void
sowwakeup(struct socket *so)
{
	if (sb_notify(&so->so_snd)) {
		sowakeup(so, &so->so_snd, NULL);
	}
}
void
soevent(struct socket *so, long hint)
{
	if (so->so_flags & SOF_KNOTE) {
		KNOTE(&so->so_klist, hint);
	}

	soevupcall(so, hint);

	/*
	 * Don't post an event if this is a subflow socket or
	 * the app has opted out of using the cellular interface
	 */
	if ((hint & SO_FILT_HINT_IFDENIED) &&
	    !(so->so_flags & SOF_MP_SUBFLOW) &&
	    !(so->so_restrictions & SO_RESTRICT_DENY_CELLULAR) &&
	    !(so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE) &&
	    !(so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED)) {
		soevent_ifdenied(so);
	}
}
void
soevupcall(struct socket *so, long hint)
{
	if (so->so_event != NULL) {
		caddr_t so_eventarg = so->so_eventarg;

		hint &= so->so_eventmask;
		if (hint != 0) {
			so->so_event(so, so_eventarg, hint);
		}
	}
}
static void
soevent_ifdenied(struct socket *so)
{
	struct kev_netpolicy_ifdenied ev_ifdenied;

	bzero(&ev_ifdenied, sizeof(ev_ifdenied));
	/*
	 * The event consumer is interested in the effective {upid,pid,uuid}
	 * info, which can be different from that of the process that most
	 * recently performed a system call on the socket, i.e. when the
	 * socket is delegated.
	 */
	if (so->so_flags & SOF_DELEGATED) {
		ev_ifdenied.ev_data.eupid = so->e_upid;
		ev_ifdenied.ev_data.epid = so->e_pid;
		uuid_copy(ev_ifdenied.ev_data.euuid, so->e_uuid);
	} else {
		ev_ifdenied.ev_data.eupid = so->last_upid;
		ev_ifdenied.ev_data.epid = so->last_pid;
		uuid_copy(ev_ifdenied.ev_data.euuid, so->last_uuid);
	}

	if (++so->so_ifdenied_notifies > 1) {
		/*
		 * Allow for at most one kernel event to be generated per
		 * socket; so_ifdenied_notifies is reset upon changes in
		 * the UUID policy.  See comments in inp_update_policy.
		 */
		if (net_io_policy_log) {
			uuid_string_t buf;

			uuid_unparse(ev_ifdenied.ev_data.euuid, buf);
			log(LOG_DEBUG, "%s[%d]: so 0x%llx [%d,%d] epid %llu "
			    "euuid %s%s has %d redundant events suppressed\n",
			    __func__, so->last_pid,
			    (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
			    SOCK_TYPE(so), ev_ifdenied.ev_data.epid, buf,
			    ((so->so_flags & SOF_DELEGATED) ?
			    " [delegated]" : ""), so->so_ifdenied_notifies);
		}
	} else {
		if (net_io_policy_log) {
			uuid_string_t buf;

			uuid_unparse(ev_ifdenied.ev_data.euuid, buf);
			log(LOG_DEBUG, "%s[%d]: so 0x%llx [%d,%d] epid %llu "
			    "euuid %s%s event posted\n", __func__,
			    so->last_pid, (uint64_t)VM_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so),
			    ev_ifdenied.ev_data.epid, buf,
			    ((so->so_flags & SOF_DELEGATED) ?
			    " [delegated]" : ""));
		}
		netpolicy_post_msg(KEV_NETPOLICY_IFDENIED, &ev_ifdenied.ev_data,
		    sizeof(ev_ifdenied));
	}
}
/*
 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
 */
struct sockaddr *
dup_sockaddr(struct sockaddr *sa, int canwait)
{
	struct sockaddr *sa2;

	MALLOC(sa2, struct sockaddr *, sa->sa_len, M_SONAME,
	    canwait ? M_WAITOK : M_NOWAIT);
	if (sa2) {
		bcopy(sa, sa2, sa->sa_len);
	}
	return sa2;
}
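/*
 * Illustrative sketch (assumption, not in the original file): a caller that
 * duplicates a peer address and later releases it would pair dup_sockaddr()
 * with FREE(..., M_SONAME), e.g.
 *
 *	struct sockaddr *copy = dup_sockaddr(sa, 1);	// canwait: may sleep
 *	if (copy != NULL) {
 *		// ... use copy ...
 *		FREE(copy, M_SONAME);
 *	}
 */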
/*
 * Create an external-format (``xsocket'') structure using the information
 * in the kernel-format socket structure pointed to by so.  This is done
 * to reduce the spew of irrelevant information over this interface,
 * to isolate user code from changes in the kernel structure, and
 * potentially to provide information-hiding if we decide that
 * some of this information should be hidden from users.
 */
void
sotoxsocket(struct socket *so, struct xsocket *xso)
{
	xso->xso_len = sizeof(*xso);
	xso->xso_so = (_XSOCKET_PTR(struct socket *))VM_KERNEL_ADDRPERM(so);
	xso->so_type = so->so_type;
	xso->so_options = (short)(so->so_options & 0xffff);
	xso->so_linger = so->so_linger;
	xso->so_state = so->so_state;
	xso->so_pcb = (_XSOCKET_PTR(caddr_t))VM_KERNEL_ADDRPERM(so->so_pcb);
	if (so->so_proto) {
		xso->xso_protocol = SOCK_PROTO(so);
		xso->xso_family = SOCK_DOM(so);
	} else {
		xso->xso_protocol = xso->xso_family = 0;
	}
	xso->so_qlen = so->so_qlen;
	xso->so_incqlen = so->so_incqlen;
	xso->so_qlimit = so->so_qlimit;
	xso->so_timeo = so->so_timeo;
	xso->so_error = so->so_error;
	xso->so_pgid = so->so_pgid;
	xso->so_oobmark = so->so_oobmark;
	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
	xso->so_uid = kauth_cred_getuid(so->so_cred);
}
#if XNU_TARGET_OS_OSX

void
sotoxsocket64(struct socket *so, struct xsocket64 *xso)
{
	xso->xso_len = sizeof(*xso);
	xso->xso_so = (u_int64_t)VM_KERNEL_ADDRPERM(so);
	xso->so_type = so->so_type;
	xso->so_options = (short)(so->so_options & 0xffff);
	xso->so_linger = so->so_linger;
	xso->so_state = so->so_state;
	xso->so_pcb = (u_int64_t)VM_KERNEL_ADDRPERM(so->so_pcb);
	if (so->so_proto) {
		xso->xso_protocol = SOCK_PROTO(so);
		xso->xso_family = SOCK_DOM(so);
	} else {
		xso->xso_protocol = xso->xso_family = 0;
	}
	xso->so_qlen = so->so_qlen;
	xso->so_incqlen = so->so_incqlen;
	xso->so_qlimit = so->so_qlimit;
	xso->so_timeo = so->so_timeo;
	xso->so_error = so->so_error;
	xso->so_pgid = so->so_pgid;
	xso->so_oobmark = so->so_oobmark;
	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
	xso->so_uid = kauth_cred_getuid(so->so_cred);
}

#endif /* XNU_TARGET_OS_OSX */
/*
 * This does the same for sockbufs.  Note that the xsockbuf structure,
 * since it is always embedded in a socket, does not include a self
 * pointer nor a length.  We make this entry point public in case
 * some other mechanism needs it.
 */
void
sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
{
	xsb->sb_cc = sb->sb_cc;
	xsb->sb_hiwat = sb->sb_hiwat;
	xsb->sb_mbcnt = sb->sb_mbcnt;
	xsb->sb_mbmax = sb->sb_mbmax;
	xsb->sb_lowat = sb->sb_lowat;
	xsb->sb_flags = (short)sb->sb_flags;
	xsb->sb_timeo = (short)
	    ((sb->sb_timeo.tv_sec * hz) + sb->sb_timeo.tv_usec / tick);
	if (xsb->sb_timeo == 0 && sb->sb_timeo.tv_usec != 0) {
		xsb->sb_timeo = 1;
	}
}
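/*
 * Worked example of the conversion above (illustrative; the concrete values
 * are assumptions): with hz = 100 and tick = 10000 microseconds, a sb_timeo
 * of 0.25 s (tv_sec = 0, tv_usec = 250000) becomes 250000 / 10000 = 25
 * clock ticks, and any nonzero timeout that would otherwise round down to 0
 * is clamped to 1 tick.
 */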
/*
 * Based on the policy set by an all-knowing decision maker, throttle sockets
 * that have been marked as belonging to a "background" process.
 */
int
soisthrottled(struct socket *so)
{
	return so->so_flags1 & SOF1_TRAFFIC_MGT_SO_BACKGROUND;
}

int
soisprivilegedtraffic(struct socket *so)
{
	return (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS) ? 1 : 0;
}

int
soissrcbackground(struct socket *so)
{
	return (so->so_flags1 & SOF1_TRAFFIC_MGT_SO_BACKGROUND) ||
	       IS_SO_TC_BACKGROUND(so->so_traffic_class);
}

int
soissrcrealtime(struct socket *so)
{
	return so->so_traffic_class >= SO_TC_AV &&
	       so->so_traffic_class <= SO_TC_VO;
}

int
soissrcbesteffort(struct socket *so)
{
	return so->so_traffic_class == SO_TC_BE ||
	       so->so_traffic_class == SO_TC_RD ||
	       so->so_traffic_class == SO_TC_OAM;
}

void
soclearfastopen(struct socket *so)
{
	if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
		so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
	}

	if (so->so_flags1 & SOF1_DATA_IDEMPOTENT) {
		so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
	}
}

void
sonullevent(struct socket *so, void *arg, long hint)
{
#pragma unused(so, arg, hint)
}
/*
 * Here is the definition of some of the basic objects in the kern.ipc
 * branch of the MIB.
 */
SYSCTL_NODE(_kern, KERN_IPC, ipc,
    CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, 0, "IPC");

/* Check that the maximum socket buffer size is within a range */
static int
sysctl_sb_max SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	u_int32_t new_value;
	int changed = 0;
	int error = sysctl_io_number(req, sb_max, sizeof(u_int32_t),
	    &new_value, &changed);
	if (!error && changed) {
		if (new_value > LOW_SB_MAX && new_value <= high_sb_max) {
			sb_max = new_value;
		} else {
			error = ERANGE;
		}
	}
	return error;
}

SYSCTL_PROC(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &sb_max, 0, &sysctl_sb_max, "IU", "Maximum socket buffer size");
SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sb_efficiency, 0, "");

SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters,
    CTLFLAG_RD | CTLFLAG_LOCKED, &nmbclusters, 0, "");

SYSCTL_INT(_kern_ipc, OID_AUTO, njcl,
    CTLFLAG_RD | CTLFLAG_LOCKED, &njcl, 0, "");

SYSCTL_INT(_kern_ipc, OID_AUTO, njclbytes,
    CTLFLAG_RD | CTLFLAG_LOCKED, &njclbytes, 0, "");

SYSCTL_INT(_kern_ipc, KIPC_SOQLIMITCOMPAT, soqlimitcompat,
    CTLFLAG_RW | CTLFLAG_LOCKED, &soqlimitcompat, 1,
    "Enable socket queue limit compatibility");

/*
 * Hack alert -- rdar://33572856
 * A loopback test we cannot change was failing because it sets
 * SO_SENDTIMEO to 5 seconds and that's also the value
 * of the minimum persist timer. Because of the persist timer,
 * the connection was not idle for 5 seconds and SO_SNDTIMEO
 * was not triggering at 5 seconds, causing the test failure.
 * As a workaround we check the sysctl soqlencomp that the test is
 * already setting in order to disable auto tuning of the receive buffer.
 */
extern u_int32_t tcp_do_autorcvbuf;

static int
sysctl_soqlencomp SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	u_int32_t new_value;
	int changed = 0;
	int error = sysctl_io_number(req, soqlencomp, sizeof(u_int32_t),
	    &new_value, &changed);
	if (!error && changed) {
		soqlencomp = new_value;
		if (new_value != 0) {
			tcp_do_autorcvbuf = 0;
			tcptv_persmin_val = 6 * TCP_RETRANSHZ;
		}
	}
	return error;
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, soqlencomp,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &soqlencomp, 0, &sysctl_soqlencomp, "IU", "");

SYSCTL_INT(_kern_ipc, OID_AUTO, sbmb_cnt, CTLFLAG_RD | CTLFLAG_LOCKED,
    &total_sbmb_cnt, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sbmb_cnt_peak, CTLFLAG_RD | CTLFLAG_LOCKED,
    &total_sbmb_cnt_peak, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sbmb_cnt_floor, CTLFLAG_RD | CTLFLAG_LOCKED,
    &total_sbmb_cnt_floor, 0, "");
SYSCTL_QUAD(_kern_ipc, OID_AUTO, sbmb_limreached, CTLFLAG_RD | CTLFLAG_LOCKED,
    &sbmb_limreached, "");


SYSCTL_NODE(_kern_ipc, OID_AUTO, io_policy, CTLFLAG_RW, 0, "network IO policy");

SYSCTL_INT(_kern_ipc_io_policy, OID_AUTO, log, CTLFLAG_RW | CTLFLAG_LOCKED,
    &net_io_policy_log, 0, "");

#if CONFIG_PROC_UUID_POLICY
SYSCTL_INT(_kern_ipc_io_policy, OID_AUTO, uuid, CTLFLAG_RW | CTLFLAG_LOCKED,
    &net_io_policy_uuid, 0, "");
#endif /* CONFIG_PROC_UUID_POLICY */