bsd/net/content_filter.c

   1 /*
   2  * Copyright (c) 2013-2017 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. Please obtain a copy of the License at
  10  * http://www.opensource.apple.com/apsl/ and read it before using this
  11  * file.
  12  *
  13  * The Original Code and all software distributed under the License are
  14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  18  * Please see the License for the specific language governing rights and
  19  * limitations under the License.
  20  *
  21  * @APPLE_LICENSE_HEADER_END@
  22  */
  23
  24 /*
  25  * THEORY OF OPERATION
  26  *
  27  * The socket content filter subsystem provides a way for user space agents to
  28  * make filtering decisions based on the content of the data being sent and
  29  * received by TCP/IP sockets.
  30  *
  31  * A content filter user space agents gets a copy of the data and the data is
  32  * also kept in kernel buffer until the user space agents makes a pass or drop
  33  * decision. This unidirectional flow of content avoids unnecessary data copies
  34  * back to the kernel.
  35  *
  36  * A user space filter agent opens a kernel control socket with the name
  37  * CONTENT_FILTER_CONTROL_NAME to attach to the socket content filter subsystem.
  38  * When connected, a "struct content_filter" is created and set as the
  39  * "unitinfo" of the corresponding kernel control socket instance.
  40  *
  41  * The socket content filter subsystem exchanges messages with the user space
  42  * filter agent until an ultimate pass or drop decision is made by the
  43  * user space filter agent.
  44  *
  45  * It should be noted that messages about many TCP/IP sockets can be multiplexed
  46  * over a single kernel control socket.
  47  *
  48  * Notes:
  49  * - The current implementation is limited to TCP sockets.
  50  * - The current implementation supports up to two simultaneous content filters
  51  *   for the sake of simplicity of the implementation.
  52  *
  53  *
  54  * NECP FILTER CONTROL UNIT
  55  *
  56  * A user space filter agent uses the Network Extension Control Policy (NECP)
  57  * database to specify which TCP/IP sockets need to be filtered. The NECP
  58  * criteria may be based on a variety of properties like user ID or proc UUID.
  59  *
  60  * The NECP "filter control unit" is used by the socket content filter subsystem
  61  * to deliver the relevant TCP/IP content information to the appropriate
  62  * user space filter agent via its kernel control socket instance.
  63  * This works as follows:
  64  *
  65  * 1) The user space filter agent specifies an NECP filter control unit when
  66  *    in adds its filtering rules to the NECP database.
  67  *
  68  * 2) The user space filter agent also sets its NECP filter control unit on the
  69  *    content filter kernel control socket via the socket option
  70  *    CFIL_OPT_NECP_CONTROL_UNIT.
  71  *
  72  * 3) The NECP database is consulted to find out if a given TCP/IP socket
  73  *    needs to be subjected to content filtering and returns the corresponding
  74  *    NECP filter control unit  -- the NECP filter control unit is actually
  75  *    stored in the TCP/IP socket structure so the NECP lookup is really simple.
  76  *
  77  * 4) The NECP filter control unit is then used to find the corresponding
  78  *    kernel control socket instance.
  79  *
  80  * Note: NECP currently supports a single filter control unit per TCP/IP socket
  81  *       but this restriction may be soon lifted.
  82  *
  83  *
  84  * THE MESSAGING PROTOCOL
  85  *
  86  * The socket content filter subsystem and a user space filter agent
  87  * communicate over the kernel control socket via an asynchronous
  88  * messaging protocol (this is not a request-response protocol).
  89  * The socket content filter subsystem sends event messages to the user
  90  * space filter agent about the TCP/IP sockets it is interested to filter.
  91  * The user space filter agent sends action messages to either allow
  92  * data to pass or to disallow the data flow (and drop the connection).
  93  *
  94  * All messages over a content filter kernel control socket share the same
  95  * common header of type "struct cfil_msg_hdr". The message type tells if
  96  * it's a event message "CFM_TYPE_EVENT" or a action message "CFM_TYPE_ACTION".
  97  * The message header field "cfm_sock_id" identifies a given TCP/IP socket.
  98  * Note the message header length field may be padded for alignment and can
  99  * be larger than the actual content of the message.
 100  * The field "cfm_op" describe the kind of event or action.
 101  *
 102  * Here are the kinds of content filter events:
 103  * - CFM_OP_SOCKET_ATTACHED: a new TCP/IP socket is being filtered
 104  * - CFM_OP_SOCKET_CLOSED: A TCP/IP socket is closed
 105  * - CFM_OP_DATA_OUT: A span of data is being sent on a TCP/IP socket
 106  * - CFM_OP_DATA_IN: A span of data is being or received on a TCP/IP socket
 107  *
 108  *
 109  * EVENT MESSAGES
 110  *
 111  * The CFM_OP_DATA_OUT and CFM_OP_DATA_IN event messages contains a span of
 112  * data that is being sent or received. The position of this span of data
 113  * in the data flow is described by a set of start and end offsets. These
 114  * are absolute 64 bits offsets. The first byte sent (or received) starts
 115  * at offset 0 and ends at offset 1. The length of the content data
 116  * is given by the difference between the end offset and the start offset.
 117  *
 118  * After a CFM_OP_SOCKET_ATTACHED is delivered, CFM_OP_DATA_OUT and
 119  * CFM_OP_DATA_OUT events are not delivered until a CFM_OP_DATA_UPDATE
 120  * action message is sent by the user space filter agent.
 121  *
 122  * Note: absolute 64 bits offsets should be large enough for the foreseeable
 123  * future.  A 64-bits counter will wrap after 468 years at 10 Gbit/sec:
 124  *   2E64 / ((10E9 / 8) * 60 * 60 * 24 * 365.25) = 467.63
 125  *
 126  * They are two kinds of primary content filter actions:
 127  * - CFM_OP_DATA_UPDATE: to update pass or peek offsets for each direction.
 128  * - CFM_OP_DROP: to shutdown socket and disallow further data flow
 129  *
 130  * There is also an action to mark a given client flow as already filtered
 131  * at a higher level, CFM_OP_BLESS_CLIENT.
 132  *
 133  *
 134  * ACTION MESSAGES
 135  *
 136  * The CFM_OP_DATA_UPDATE action messages let the user space filter
 137  * agent allow data to flow up to the specified pass offset -- there
 138  * is a pass offset for outgoing data and  a pass offset for incoming data.
 139  * When a new TCP/IP socket is attached to the content filter, each pass offset
 140  * is initially set to 0 so not data is allowed to pass by default.
 141  * When the pass offset is set to CFM_MAX_OFFSET via a CFM_OP_DATA_UPDATE
 142  * then the data flow becomes unrestricted.
 143  *
 144  * Note that pass offsets can only be incremented. A CFM_OP_DATA_UPDATE message
 145  * with a pass offset smaller than the pass offset of a previous
 146  * CFM_OP_DATA_UPDATE message is silently ignored.
 147  *
 148  * A user space filter agent also uses CFM_OP_DATA_UPDATE action messages
 149  * to tell the kernel how much data it wants to see by using the peek offsets.
 150  * Just like pass offsets, there is a peek offset for each direction.
 151  * When a new TCP/IP socket is attached to the content filter, each peek offset
 152  * is initially set to 0 so no CFM_OP_DATA_OUT and CFM_OP_DATA_IN event
 153  * messages are dispatched by default until a CFM_OP_DATA_UPDATE action message
 154  * with a greater than 0 peek offset is sent by the user space filter agent.
 155  * When the peek offset is set to CFM_MAX_OFFSET via a CFM_OP_DATA_UPDATE
 156  * then the flow of update data events becomes unrestricted.
 157  *
 158  * Note that peek offsets cannot be smaller than the corresponding pass offset.
 159  * Also a peek offsets cannot be smaller than the corresponding end offset
 160  * of the last CFM_OP_DATA_OUT/CFM_OP_DATA_IN message dispatched. Trying
 161  * to set a too small peek value is silently ignored.
 162  *
 163  *
 164  * PER SOCKET "struct cfil_info"
 165  *
 166  * As soon as a TCP/IP socket gets attached to a content filter, a
 167  * "struct cfil_info" is created to hold the content filtering state for this
 168  * socket.
 169  *
 170  * The content filtering state is made of the following information
 171  * for each direction:
 172  * - The current pass offset;
 173  * - The first and last offsets of the data pending, waiting for a filtering
 174  *   decision;
 175  * - The inject queue for data that passed the filters and that needs
 176  *   to be re-injected;
 177  * - A content filter specific state in a set of  "struct cfil_entry"
 178  *
 179  *
 180  * CONTENT FILTER STATE "struct cfil_entry"
 181  *
 182  * The "struct cfil_entry" maintains the information most relevant to the
 183  * message handling over a kernel control socket with a user space filter agent.
 184  *
 185  * The "struct cfil_entry" holds the NECP filter control unit that corresponds
 186  * to the kernel control socket unit it corresponds to and also has a pointer
 187  * to the corresponding "struct content_filter".
 188  *
 189  * For each direction, "struct cfil_entry" maintains the following information:
 190  * - The pass offset
 191  * - The peek offset
 192  * - The offset of the last data peeked at by the filter
 193  * - A queue of data that's waiting to be delivered to the  user space filter
 194  *   agent on the kernel control socket
 195  * - A queue of data for which event messages have been sent on the kernel
 196  *   control socket and are pending for a filtering decision.
 197  *
 198  *
 199  * CONTENT FILTER QUEUES
 200  *
 201  * Data that is being filtered is steered away from the TCP/IP socket buffer
 202  * and instead will sit in one of three content filter queues until the data
 203  * can be re-injected into the TCP/IP socket buffer.
 204  *
 205  * A content filter queue is represented by "struct cfil_queue" that contains
 206  * a list of mbufs and the start and end offset of the data span of
 207  * the list of mbufs.
 208  *
 209  * The data moves into the three content filter queues according to this
 210  * sequence:
 211  * a) The "cfe_ctl_q" of "struct cfil_entry"
 212  * b) The "cfe_pending_q" of "struct cfil_entry"
 213  * c) The "cfi_inject_q" of "struct cfil_info"
 214  *
 215  * Note: The sequence (a),(b) may be repeated several times if there is more
 216  * than one content filter attached to the TCP/IP socket.
 217  *
 218  * The "cfe_ctl_q" queue holds data than cannot be delivered to the
 219  * kernel conntrol socket for two reasons:
 220  * - The peek offset is less that the end offset of the mbuf data
 221  * - The kernel control socket is flow controlled
 222  *
 223  * The "cfe_pending_q" queue holds data for which CFM_OP_DATA_OUT or
 224  * CFM_OP_DATA_IN have been successfully dispatched to the kernel control
 225  * socket and are waiting for a pass action message fromn the user space
 226  * filter agent. An mbuf length must be fully allowed to pass to be removed
 227  * from the cfe_pending_q.
 228  *
 229  * The "cfi_inject_q" queue holds data that has been fully allowed to pass
 230  * by the user space filter agent and that needs to be re-injected into the
 231  * TCP/IP socket.
 232  *
 233  *
 234  * IMPACT ON FLOW CONTROL
 235  *
 236  * An essential aspect of the content filer subsystem is to minimize the
 237  * impact on flow control of the TCP/IP sockets being filtered.
 238  *
 239  * The processing overhead of the content filtering may have an effect on
 240  * flow control by adding noticeable delays and cannot be eliminated --
 241  * care must be taken by the user space filter agent to minimize the
 242  * processing delays.
 243  *
 244  * The amount of data being filtered is kept in buffers while waiting for
 245  * a decision by the user space filter agent. This amount of data pending
 246  * needs to be subtracted from the amount of data available in the
 247  * corresponding TCP/IP socket buffer. This is done by modifying
 248  * sbspace() and tcp_sbspace() to account for amount of data pending
 249  * in the content filter.
 250  *
 251  *
 252  * LOCKING STRATEGY
 253  *
 254  * The global state of content filter subsystem is protected by a single
 255  * read-write lock "cfil_lck_rw". The data flow can be done with the
 256  * cfil read-write lock held as shared so it can be re-entered from multiple
 257  * threads.
 258  *
 259  * The per TCP/IP socket content filterstate -- "struct cfil_info" -- is
 260  * protected by the socket lock.
 261  *
 262  * A TCP/IP socket lock cannot be taken while the cfil read-write lock
 263  * is held. That's why we have some sequences where we drop the cfil read-write
 264  * lock before taking the TCP/IP lock.
 265  *
 266  * It is also important to lock the TCP/IP socket buffer while the content
 267  * filter is modifying the amount of pending data. Otherwise the calculations
 268  * in sbspace() and tcp_sbspace()  could be wrong.
 269  *
 270  * The "cfil_lck_rw" protects "struct content_filter" and also the fields
 271  * "cfe_link" and "cfe_filter" of "struct cfil_entry".
 272  *
 273  * Actually "cfe_link" and "cfe_filter" are protected by both by
 274  * "cfil_lck_rw" and the socket lock: they may be modified only when
 275  * "cfil_lck_rw" is exclusive and the socket is locked.
 276  *
 277  * To read the other fields of "struct content_filter" we have to take
 278  * "cfil_lck_rw" in shared mode.
 279  *
 280  *
 281  * LIMITATIONS
 282  *
 283  * - For TCP sockets only
 284  *
 285  * - Does not support TCP unordered messages
 286  */
 287
 288 /*
 289  *      TO DO LIST
 290  *
 291  *      SOONER:
 292  *
 293  *      Deal with OOB
 294  *
 295  *      LATER:
 296  *
 297  *      If support datagram, enqueue control and address mbufs as well
 298  */
 299
 300 #include <sys/types.h>
 301 #include <sys/kern_control.h>
 302 #include <sys/queue.h>
 303 #include <sys/domain.h>
 304 #include <sys/protosw.h>
 305 #include <sys/syslog.h>
 306
 307 #include <kern/locks.h>
 308 #include <kern/zalloc.h>
 309 #include <kern/debug.h>
 310
 311 #include <net/content_filter.h>
 312
 313 #include <netinet/in_pcb.h>
 314 #include <netinet/tcp.h>
 315 #include <netinet/tcp_var.h>
 316
 317 #include <string.h>
 318 #include <libkern/libkern.h>
 319
 320
 321 #define MAX_CONTENT_FILTER 2
 322
 323 struct cfil_entry;
 324
 325 /*
 326  * The structure content_filter represents a user space content filter
 327  * It's created and associated with a kernel control socket instance
 328  */
 329 struct content_filter {
 330         kern_ctl_ref            cf_kcref;
 331         u_int32_t               cf_kcunit;
 332         u_int32_t               cf_flags;
 333
 334         uint32_t                cf_necp_control_unit;
 335
 336         uint32_t                cf_sock_count;
 337         TAILQ_HEAD(, cfil_entry) cf_sock_entries;
 338 };
 339
 340 #define CFF_ACTIVE              0x01
 341 #define CFF_DETACHING           0x02
 342 #define CFF_FLOW_CONTROLLED     0x04
 343
 344 struct content_filter **content_filters = NULL;
 345 uint32_t cfil_active_count = 0; /* Number of active content filters */
 346 uint32_t cfil_sock_attached_count = 0;  /* Number of sockets attachements */
 347 uint32_t cfil_close_wait_timeout = 1000; /* in milliseconds */
 348
 349 static kern_ctl_ref cfil_kctlref = NULL;
 350
 351 static lck_grp_attr_t *cfil_lck_grp_attr = NULL;
 352 static lck_attr_t *cfil_lck_attr = NULL;
 353 static lck_grp_t *cfil_lck_grp = NULL;
 354 decl_lck_rw_data(static, cfil_lck_rw);
 355
 356 #define CFIL_RW_LCK_MAX 8
 357
 358 int cfil_rw_nxt_lck = 0;
 359 void* cfil_rw_lock_history[CFIL_RW_LCK_MAX];
 360
 361 int cfil_rw_nxt_unlck = 0;
 362 void* cfil_rw_unlock_history[CFIL_RW_LCK_MAX];
 363
 364 #define CONTENT_FILTER_ZONE_NAME        "content_filter"
 365 #define CONTENT_FILTER_ZONE_MAX         10
 366 static struct zone *content_filter_zone = NULL; /* zone for content_filter */
 367
 368
 369 #define CFIL_INFO_ZONE_NAME     "cfil_info"
 370 #define CFIL_INFO_ZONE_MAX      1024
 371 static struct zone *cfil_info_zone = NULL;      /* zone for cfil_info */
 372
 373 MBUFQ_HEAD(cfil_mqhead);
 374
 375 struct cfil_queue {
 376         uint64_t                q_start; /* offset of first byte in queue */
 377         uint64_t                q_end; /* offset of last byte in queue */
 378         struct cfil_mqhead      q_mq;
 379 };
 380
 381 /*
 382  * struct cfil_entry
 383  *
 384  * The is one entry per content filter
 385  */
 386 struct cfil_entry {
 387         TAILQ_ENTRY(cfil_entry) cfe_link;
 388         struct content_filter   *cfe_filter;
 389
 390         struct cfil_info        *cfe_cfil_info;
 391         uint32_t                cfe_flags;
 392         uint32_t                cfe_necp_control_unit;
 393         struct timeval          cfe_last_event; /* To user space */
 394         struct timeval          cfe_last_action; /* From user space */
 395
 396         struct cfe_buf {
 397                 /*
 398                  * cfe_pending_q holds data that has been delivered to
 399                  * the filter and for which we are waiting for an action
 400                  */
 401                 struct cfil_queue       cfe_pending_q;
 402                 /*
 403                  * This queue is for data that has not be delivered to
 404                  * the content filter (new data, pass peek or flow control)
 405                  */
 406                 struct cfil_queue       cfe_ctl_q;
 407
 408                 uint64_t                cfe_pass_offset;
 409                 uint64_t                cfe_peek_offset;
 410                 uint64_t                cfe_peeked;
 411         } cfe_snd, cfe_rcv;
 412 };
 413
 414 #define CFEF_CFIL_ATTACHED              0x0001  /* was attached to filter */
 415 #define CFEF_SENT_SOCK_ATTACHED         0x0002  /* sock attach event was sent */
 416 #define CFEF_DATA_START                 0x0004  /* can send data event */
 417 #define CFEF_FLOW_CONTROLLED            0x0008  /* wait for flow control lift */
 418 #define CFEF_SENT_DISCONNECT_IN         0x0010  /* event was sent */
 419 #define CFEF_SENT_DISCONNECT_OUT        0x0020  /* event was sent */
 420 #define CFEF_SENT_SOCK_CLOSED           0x0040  /* closed event was sent */
 421 #define CFEF_CFIL_DETACHED              0x0080  /* filter was detached */
 422
 423
 424 #define CFI_ADD_TIME_LOG(cfil, t1, t0, op)                                                                                      \
 425                 struct timeval _tdiff;                                                                                          \
 426                 if ((cfil)->cfi_op_list_ctr < CFI_MAX_TIME_LOG_ENTRY) {                                                         \
 427                         timersub(t1, t0, &_tdiff);                                                                              \
 428                         (cfil)->cfi_op_time[(cfil)->cfi_op_list_ctr] = (uint32_t)(_tdiff.tv_sec * 1000 + _tdiff.tv_usec / 1000);\
 429                         (cfil)->cfi_op_list[(cfil)->cfi_op_list_ctr] = (unsigned char)op;                                       \
 430                         (cfil)->cfi_op_list_ctr ++;                                                                             \
 431                 }
 432
 433 /*
 434  * struct cfil_info
 435  *
 436  * There is a struct cfil_info per socket
 437  */
 438 struct cfil_info {
 439         TAILQ_ENTRY(cfil_info)  cfi_link;
 440         struct socket           *cfi_so;
 441         uint64_t                cfi_flags;
 442         uint64_t                cfi_sock_id;
 443         struct timeval64        cfi_first_event;
 444         uint32_t                cfi_op_list_ctr;
 445         uint32_t                cfi_op_time[CFI_MAX_TIME_LOG_ENTRY];    /* time interval in microseconds since first event */
 446         unsigned char           cfi_op_list[CFI_MAX_TIME_LOG_ENTRY];
 447
 448         struct cfi_buf {
 449                 /*
 450                  * cfi_pending_first and cfi_pending_last describe the total
 451                  * amount of data outstanding for all the filters on
 452                  * this socket and data in the flow queue
 453                  * cfi_pending_mbcnt counts in sballoc() "chars of mbufs used"
 454                  */
 455                 uint64_t                cfi_pending_first;
 456                 uint64_t                cfi_pending_last;
 457                 int                     cfi_pending_mbcnt;
 458                 /*
 459                  * cfi_pass_offset is the minimum of all the filters
 460                  */
 461                 uint64_t                cfi_pass_offset;
 462                 /*
 463                  * cfi_inject_q holds data that needs to be re-injected
 464                  * into the socket after filtering and that can
 465                  * be queued because of flow control
 466                  */
 467                 struct cfil_queue       cfi_inject_q;
 468         } cfi_snd, cfi_rcv;
 469
 470         struct cfil_entry       cfi_entries[MAX_CONTENT_FILTER];
 471 } __attribute__((aligned(8)));
 472
 473 #define CFIF_DROP               0x0001  /* drop action applied */
 474 #define CFIF_CLOSE_WAIT         0x0002  /* waiting for filter to close */
 475 #define CFIF_SOCK_CLOSED        0x0004  /* socket is closed */
 476 #define CFIF_RETRY_INJECT_IN    0x0010  /* inject in failed */
 477 #define CFIF_RETRY_INJECT_OUT   0x0020  /* inject out failed */
 478 #define CFIF_SHUT_WR            0x0040  /* shutdown write */
 479 #define CFIF_SHUT_RD            0x0080  /* shutdown read */
 480
 481 #define CFI_MASK_GENCNT         0xFFFFFFFF00000000      /* upper 32 bits */
 482 #define CFI_SHIFT_GENCNT        32
 483 #define CFI_MASK_FLOWHASH       0x00000000FFFFFFFF      /* lower 32 bits */
 484 #define CFI_SHIFT_FLOWHASH      0
 485
 486 TAILQ_HEAD(cfil_sock_head, cfil_info) cfil_sock_head;
 487
 488 #define CFIL_QUEUE_VERIFY(x) if (cfil_debug) cfil_queue_verify(x)
 489 #define CFIL_INFO_VERIFY(x) if (cfil_debug) cfil_info_verify(x)
 490
 491 /*
 492  * Statistics
 493  */
 494
 495 struct cfil_stats cfil_stats;
 496
 497 /*
 498  * For troubleshooting
 499  */
 500 int cfil_log_level = LOG_ERR;
 501 int cfil_debug = 1;
 502
 503 /*
 504  * Sysctls for logs and statistics
 505  */
 506 static int sysctl_cfil_filter_list(struct sysctl_oid *, void *, int,
 507         struct sysctl_req *);
 508 static int sysctl_cfil_sock_list(struct sysctl_oid *, void *, int,
 509         struct sysctl_req *);
 510
 511 SYSCTL_NODE(_net, OID_AUTO, cfil, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "cfil");
 512
 513 SYSCTL_INT(_net_cfil, OID_AUTO, log, CTLFLAG_RW|CTLFLAG_LOCKED,
 514         &cfil_log_level, 0, "");
 515
 516 SYSCTL_INT(_net_cfil, OID_AUTO, debug, CTLFLAG_RW|CTLFLAG_LOCKED,
 517         &cfil_debug, 0, "");
 518
 519 SYSCTL_UINT(_net_cfil, OID_AUTO, sock_attached_count, CTLFLAG_RD|CTLFLAG_LOCKED,
 520         &cfil_sock_attached_count, 0, "");
 521
 522 SYSCTL_UINT(_net_cfil, OID_AUTO, active_count, CTLFLAG_RD|CTLFLAG_LOCKED,
 523         &cfil_active_count, 0, "");
 524
 525 SYSCTL_UINT(_net_cfil, OID_AUTO, close_wait_timeout, CTLFLAG_RW|CTLFLAG_LOCKED,
 526         &cfil_close_wait_timeout, 0, "");
 527
 528 static int cfil_sbtrim = 1;
 529 SYSCTL_UINT(_net_cfil, OID_AUTO, sbtrim, CTLFLAG_RW|CTLFLAG_LOCKED,
 530         &cfil_sbtrim, 0, "");
 531
 532 SYSCTL_PROC(_net_cfil, OID_AUTO, filter_list, CTLFLAG_RD|CTLFLAG_LOCKED,
 533         0, 0, sysctl_cfil_filter_list, "S,cfil_filter_stat",  "");
 534
 535 SYSCTL_PROC(_net_cfil, OID_AUTO, sock_list, CTLFLAG_RD|CTLFLAG_LOCKED,
 536         0, 0, sysctl_cfil_sock_list, "S,cfil_sock_stat",  "");
 537
 538 SYSCTL_STRUCT(_net_cfil, OID_AUTO, stats, CTLFLAG_RD|CTLFLAG_LOCKED,
 539         &cfil_stats, cfil_stats, "");
 540
 541 /*
 542  * Forward declaration to appease the compiler
 543  */
 544 static int cfil_action_data_pass(struct socket *, uint32_t, int,
 545         uint64_t, uint64_t);
 546 static int cfil_action_drop(struct socket *, uint32_t);
 547 static int cfil_action_bless_client(uint32_t, struct cfil_msg_hdr *);
 548 static int cfil_dispatch_closed_event(struct socket *, int);
 549 static int cfil_data_common(struct socket *, int, struct sockaddr *,
 550         struct mbuf *, struct mbuf *, uint32_t);
 551 static int cfil_data_filter(struct socket *, uint32_t, int,
 552         struct mbuf *, uint64_t);
 553 static void fill_ip_sockaddr_4_6(union sockaddr_in_4_6 *,
 554         struct in_addr, u_int16_t);
 555 static void fill_ip6_sockaddr_4_6(union sockaddr_in_4_6 *,
 556         struct in6_addr *, u_int16_t);
 557 static int cfil_dispatch_attach_event(struct socket *, uint32_t);
 558 static void cfil_info_free(struct socket *, struct cfil_info *);
 559 static struct cfil_info * cfil_info_alloc(struct socket *);
 560 static int cfil_info_attach_unit(struct socket *, uint32_t);
 561 static struct socket * cfil_socket_from_sock_id(cfil_sock_id_t);
 562 static struct socket *cfil_socket_from_client_uuid(uuid_t, bool *);
 563 static int cfil_service_pending_queue(struct socket *, uint32_t, int);
 564 static int cfil_data_service_ctl_q(struct socket *, uint32_t, int);
 565 static void cfil_info_verify(struct cfil_info *);
 566 static int cfil_update_data_offsets(struct socket *, uint32_t, int,
 567         uint64_t, uint64_t);
 568 static int cfil_acquire_sockbuf(struct socket *, int);
 569 static void cfil_release_sockbuf(struct socket *, int);
 570 static int cfil_filters_attached(struct socket *);
 571
 572 static void cfil_rw_lock_exclusive(lck_rw_t *);
 573 static void cfil_rw_unlock_exclusive(lck_rw_t *);
 574 static void cfil_rw_lock_shared(lck_rw_t *);
 575 static void cfil_rw_unlock_shared(lck_rw_t *);
 576 static boolean_t cfil_rw_lock_shared_to_exclusive(lck_rw_t *);
 577 static void cfil_rw_lock_exclusive_to_shared(lck_rw_t *);
 578
 579 static unsigned int cfil_data_length(struct mbuf *, int *);
 580
 581 /*
 582  * Content filter global read write lock
 583  */
 584
 585 static void
 586 cfil_rw_lock_exclusive(lck_rw_t *lck)
 587 {
 588         void *lr_saved;
 589
 590         lr_saved = __builtin_return_address(0);
 591
 592         lck_rw_lock_exclusive(lck);
 593
 594         cfil_rw_lock_history[cfil_rw_nxt_lck] = lr_saved;
 595         cfil_rw_nxt_lck = (cfil_rw_nxt_lck + 1) % CFIL_RW_LCK_MAX;
 596 }
 597
 598 static void
 599 cfil_rw_unlock_exclusive(lck_rw_t *lck)
 600 {
 601         void *lr_saved;
 602
 603         lr_saved = __builtin_return_address(0);
 604
 605         lck_rw_unlock_exclusive(lck);
 606
 607         cfil_rw_unlock_history[cfil_rw_nxt_unlck] = lr_saved;
 608         cfil_rw_nxt_unlck = (cfil_rw_nxt_unlck + 1) % CFIL_RW_LCK_MAX;
 609 }
 610
 611 static void
 612 cfil_rw_lock_shared(lck_rw_t *lck)
 613 {
 614         void *lr_saved;
 615
 616         lr_saved = __builtin_return_address(0);
 617
 618         lck_rw_lock_shared(lck);
 619
 620         cfil_rw_lock_history[cfil_rw_nxt_lck] = lr_saved;
 621         cfil_rw_nxt_lck = (cfil_rw_nxt_lck + 1) % CFIL_RW_LCK_MAX;
 622 }
 623
 624 static void
 625 cfil_rw_unlock_shared(lck_rw_t *lck)
 626 {
 627         void *lr_saved;
 628
 629         lr_saved = __builtin_return_address(0);
 630
 631         lck_rw_unlock_shared(lck);
 632
 633         cfil_rw_unlock_history[cfil_rw_nxt_unlck] = lr_saved;
 634         cfil_rw_nxt_unlck = (cfil_rw_nxt_unlck + 1) % CFIL_RW_LCK_MAX;
 635 }
 636
 637 static boolean_t
 638 cfil_rw_lock_shared_to_exclusive(lck_rw_t *lck)
 639 {
 640         void *lr_saved;
 641         boolean_t upgraded;
 642
 643         lr_saved = __builtin_return_address(0);
 644
 645         upgraded = lck_rw_lock_shared_to_exclusive(lck);
 646         if (upgraded) {
 647                 cfil_rw_unlock_history[cfil_rw_nxt_unlck] = lr_saved;
 648                 cfil_rw_nxt_unlck = (cfil_rw_nxt_unlck + 1) % CFIL_RW_LCK_MAX;
 649         }
 650         return (upgraded);
 651 }
 652
 653 static void
 654 cfil_rw_lock_exclusive_to_shared(lck_rw_t *lck)
 655 {
 656         void *lr_saved;
 657
 658         lr_saved = __builtin_return_address(0);
 659
 660         lck_rw_lock_exclusive_to_shared(lck);
 661
 662         cfil_rw_lock_history[cfil_rw_nxt_lck] = lr_saved;
 663         cfil_rw_nxt_lck = (cfil_rw_nxt_lck + 1) % CFIL_RW_LCK_MAX;
 664 }
 665
 666 static void
 667 cfil_rw_lock_assert_held(lck_rw_t *lck, int exclusive)
 668 {
 669 #if !MACH_ASSERT
 670 #pragma unused(lck, exclusive)
 671 #endif
 672         LCK_RW_ASSERT(lck,
 673             exclusive ? LCK_RW_ASSERT_EXCLUSIVE : LCK_RW_ASSERT_HELD);
 674 }
 675
 676 /*
 677  * Return the number of bytes in the mbuf chain using the same
 678  * method as m_length() or sballoc()
 679  */
 680 static unsigned int
 681 cfil_data_length(struct mbuf *m, int *retmbcnt)
 682 {
 683         struct mbuf *m0;
 684         unsigned int pktlen;
 685         int mbcnt;
 686
 687         if (retmbcnt == NULL)
 688                 return (m_length(m));
 689
 690         pktlen = 0;
 691         mbcnt = 0;
 692         for (m0 = m; m0 != NULL; m0 = m0->m_next) {
 693                 pktlen += m0->m_len;
 694                 mbcnt += MSIZE;
 695                 if (m0->m_flags & M_EXT)
 696                         mbcnt += m0->m_ext.ext_size;
 697         }
 698         *retmbcnt = mbcnt;
 699         return (pktlen);
 700 }
 701
 702 /*
 703  * Common mbuf queue utilities
 704  */
 705
 706 static inline void
 707 cfil_queue_init(struct cfil_queue *cfq)
 708 {
 709         cfq->q_start = 0;
 710         cfq->q_end = 0;
 711         MBUFQ_INIT(&cfq->q_mq);
 712 }
 713
 714 static inline uint64_t
 715 cfil_queue_drain(struct cfil_queue *cfq)
 716 {
 717         uint64_t drained = cfq->q_start - cfq->q_end;
 718         cfq->q_start = 0;
 719         cfq->q_end = 0;
 720         MBUFQ_DRAIN(&cfq->q_mq);
 721
 722         return (drained);
 723 }
 724
 725 /* Return 1 when empty, 0 otherwise */
 726 static inline int
 727 cfil_queue_empty(struct cfil_queue *cfq)
 728 {
 729         return (MBUFQ_EMPTY(&cfq->q_mq));
 730 }
 731
 732 static inline uint64_t
 733 cfil_queue_offset_first(struct cfil_queue *cfq)
 734 {
 735         return (cfq->q_start);
 736 }
 737
 738 static inline uint64_t
 739 cfil_queue_offset_last(struct cfil_queue *cfq)
 740 {
 741         return (cfq->q_end);
 742 }
 743
 744 static inline uint64_t
 745 cfil_queue_len(struct cfil_queue *cfq)
 746 {
 747         return (cfq->q_end - cfq->q_start);
 748 }
 749
 750 /*
 751  * Routines to verify some fundamental assumptions
 752  */
 753
 754 static void
 755 cfil_queue_verify(struct cfil_queue *cfq)
 756 {
 757         mbuf_t m;
 758         mbuf_t n;
 759         uint64_t queuesize = 0;
 760
 761         /* Verify offset are ordered */
 762         VERIFY(cfq->q_start <= cfq->q_end);
 763
 764         /*
 765          * When queue is empty, the offsets are equal otherwise the offsets
 766          * are different
 767          */
 768         VERIFY((MBUFQ_EMPTY(&cfq->q_mq) && cfq->q_start == cfq->q_end) ||
 769                 (!MBUFQ_EMPTY(&cfq->q_mq) &&
 770                 cfq->q_start != cfq->q_end));
 771
 772         MBUFQ_FOREACH(m, &cfq->q_mq) {
 773                 size_t chainsize = 0;
 774                 unsigned int mlen = m_length(m);
 775
 776                 if (m == (void *)M_TAG_FREE_PATTERN ||
 777                         m->m_next == (void *)M_TAG_FREE_PATTERN ||
 778                         m->m_nextpkt == (void *)M_TAG_FREE_PATTERN)
 779                         panic("%s - mq %p is free at %p", __func__,
 780                                 &cfq->q_mq, m);
 781                 for (n = m; n != NULL; n = n->m_next) {
 782                         if (n->m_type != MT_DATA &&
 783                                 n->m_type != MT_HEADER &&
 784                                 n->m_type != MT_OOBDATA)
 785                         panic("%s - %p unsupported type %u", __func__,
 786                                 n, n->m_type);
 787                         chainsize += n->m_len;
 788                 }
 789                 if (mlen != chainsize)
 790                         panic("%s - %p m_length() %u != chainsize %lu",
 791                                 __func__, m, mlen, chainsize);
 792                 queuesize += chainsize;
 793         }
 794         if (queuesize != cfq->q_end - cfq->q_start)
 795                 panic("%s - %p queuesize %llu != offsetdiffs %llu", __func__,
 796                         m, queuesize, cfq->q_end - cfq->q_start);
 797 }
 798
 799 static void
 800 cfil_queue_enqueue(struct cfil_queue *cfq, mbuf_t m, size_t len)
 801 {
 802         CFIL_QUEUE_VERIFY(cfq);
 803
 804         MBUFQ_ENQUEUE(&cfq->q_mq, m);
 805         cfq->q_end += len;
 806
 807         CFIL_QUEUE_VERIFY(cfq);
 808 }
 809
 810 static void
 811 cfil_queue_remove(struct cfil_queue *cfq, mbuf_t m, size_t len)
 812 {
 813         CFIL_QUEUE_VERIFY(cfq);
 814
 815         VERIFY(m_length(m) == len);
 816
 817         MBUFQ_REMOVE(&cfq->q_mq, m);
 818         MBUFQ_NEXT(m) = NULL;
 819         cfq->q_start += len;
 820
 821         CFIL_QUEUE_VERIFY(cfq);
 822 }
 823
 824 static mbuf_t
 825 cfil_queue_first(struct cfil_queue *cfq)
 826 {
 827         return (MBUFQ_FIRST(&cfq->q_mq));
 828 }
 829
 830 static mbuf_t
 831 cfil_queue_next(struct cfil_queue *cfq, mbuf_t m)
 832 {
 833 #pragma unused(cfq)
 834         return (MBUFQ_NEXT(m));
 835 }
 836
 837 static void
 838 cfil_entry_buf_verify(struct cfe_buf *cfe_buf)
 839 {
 840         CFIL_QUEUE_VERIFY(&cfe_buf->cfe_ctl_q);
 841         CFIL_QUEUE_VERIFY(&cfe_buf->cfe_pending_q);
 842
 843         /* Verify the queues are ordered so that pending is before ctl */
 844         VERIFY(cfe_buf->cfe_ctl_q.q_start >= cfe_buf->cfe_pending_q.q_end);
 845
 846         /* The peek offset cannot be less than the pass offset */
 847         VERIFY(cfe_buf->cfe_peek_offset >= cfe_buf->cfe_pass_offset);
 848
 849         /* Make sure we've updated the offset we peeked at  */
 850         VERIFY(cfe_buf->cfe_ctl_q.q_start <= cfe_buf->cfe_peeked);
 851 }
 852
 853 static void
 854 cfil_entry_verify(struct cfil_entry *entry)
 855 {
 856         cfil_entry_buf_verify(&entry->cfe_snd);
 857         cfil_entry_buf_verify(&entry->cfe_rcv);
 858 }
 859
 860 static void
 861 cfil_info_buf_verify(struct cfi_buf *cfi_buf)
 862 {
 863         CFIL_QUEUE_VERIFY(&cfi_buf->cfi_inject_q);
 864
 865         VERIFY(cfi_buf->cfi_pending_first <= cfi_buf->cfi_pending_last);
 866         VERIFY(cfi_buf->cfi_pending_mbcnt >= 0);
 867 }
 868
 869 static void
 870 cfil_info_verify(struct cfil_info *cfil_info)
 871 {
 872         int i;
 873
 874         if (cfil_info == NULL)
 875                 return;
 876
 877         cfil_info_buf_verify(&cfil_info->cfi_snd);
 878         cfil_info_buf_verify(&cfil_info->cfi_rcv);
 879
 880         for (i = 0; i < MAX_CONTENT_FILTER; i++)
 881                 cfil_entry_verify(&cfil_info->cfi_entries[i]);
 882 }
 883
 884 static void
 885 verify_content_filter(struct content_filter *cfc)
 886 {
 887         struct cfil_entry *entry;
 888         uint32_t count = 0;
 889
 890         VERIFY(cfc->cf_sock_count >= 0);
 891
 892         TAILQ_FOREACH(entry, &cfc->cf_sock_entries, cfe_link) {
 893                 count++;
 894                 VERIFY(cfc == entry->cfe_filter);
 895         }
 896         VERIFY(count == cfc->cf_sock_count);
 897 }
 898
 899 /*
 900  * Kernel control socket callbacks
 901  */
 902 static errno_t
 903 cfil_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
 904                 void **unitinfo)
 905 {
 906         errno_t error = 0;
 907         struct content_filter *cfc = NULL;
 908
 909         CFIL_LOG(LOG_NOTICE, "");
 910
 911         cfc = zalloc(content_filter_zone);
 912         if (cfc == NULL) {
 913                 CFIL_LOG(LOG_ERR, "zalloc failed");
 914                 error = ENOMEM;
 915                 goto done;
 916         }
 917         bzero(cfc, sizeof(struct content_filter));
 918
 919         cfil_rw_lock_exclusive(&cfil_lck_rw);
 920         if (content_filters == NULL) {
 921                 struct content_filter **tmp;
 922
 923                 cfil_rw_unlock_exclusive(&cfil_lck_rw);
 924
 925                 MALLOC(tmp,
 926                         struct content_filter **,
 927                         MAX_CONTENT_FILTER * sizeof(struct content_filter *),
 928                         M_TEMP,
 929                         M_WAITOK | M_ZERO);
 930
 931                 cfil_rw_lock_exclusive(&cfil_lck_rw);
 932
 933                 if (tmp == NULL && content_filters == NULL) {
 934                         error = ENOMEM;
 935                         cfil_rw_unlock_exclusive(&cfil_lck_rw);
 936                         goto done;
 937                 }
 938                 /* Another thread may have won the race */
 939                 if (content_filters != NULL)
 940                         FREE(tmp, M_TEMP);
 941                 else
 942                         content_filters = tmp;
 943         }
 944
 945         if (sac->sc_unit == 0 || sac->sc_unit > MAX_CONTENT_FILTER) {
 946                 CFIL_LOG(LOG_ERR, "bad sc_unit %u", sac->sc_unit);
 947                 error = EINVAL;
 948         } else if (content_filters[sac->sc_unit - 1] != NULL) {
 949                 CFIL_LOG(LOG_ERR, "sc_unit %u in use", sac->sc_unit);
 950                 error = EADDRINUSE;
 951         } else {
 952                 /*
 953                  * kernel control socket kcunit numbers start at 1
 954                  */
 955                 content_filters[sac->sc_unit - 1] = cfc;
 956
 957                 cfc->cf_kcref = kctlref;
 958                 cfc->cf_kcunit = sac->sc_unit;
 959                 TAILQ_INIT(&cfc->cf_sock_entries);
 960
 961                 *unitinfo = cfc;
 962                 cfil_active_count++;
 963         }
 964         cfil_rw_unlock_exclusive(&cfil_lck_rw);
 965 done:
 966         if (error != 0 && cfc != NULL)
 967                 zfree(content_filter_zone, cfc);
 968
 969         if (error == 0)
 970                 OSIncrementAtomic(&cfil_stats.cfs_ctl_connect_ok);
 971         else
 972                 OSIncrementAtomic(&cfil_stats.cfs_ctl_connect_fail);
 973
 974         CFIL_LOG(LOG_INFO, "return %d cfil_active_count %u kcunit %u",
 975                 error, cfil_active_count, sac->sc_unit);
 976
 977         return (error);
 978 }
 979
 980 static errno_t
 981 cfil_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo)
 982 {
 983 #pragma unused(kctlref)
 984         errno_t error = 0;
 985         struct content_filter *cfc;
 986         struct cfil_entry *entry;
 987
 988         CFIL_LOG(LOG_NOTICE, "");
 989
 990         if (content_filters == NULL) {
 991                 CFIL_LOG(LOG_ERR, "no content filter");
 992                 error = EINVAL;
 993                 goto done;
 994         }
 995         if (kcunit > MAX_CONTENT_FILTER) {
 996                 CFIL_LOG(LOG_ERR, "kcunit %u > MAX_CONTENT_FILTER (%d)",
 997                         kcunit, MAX_CONTENT_FILTER);
 998                 error = EINVAL;
 999                 goto done;
1000         }
1001
1002         cfc = (struct content_filter *)unitinfo;
1003         if (cfc == NULL)
1004                 goto done;
1005
1006         cfil_rw_lock_exclusive(&cfil_lck_rw);
1007         if (content_filters[kcunit - 1] != cfc || cfc->cf_kcunit != kcunit) {
1008                 CFIL_LOG(LOG_ERR, "bad unit info %u)",
1009                         kcunit);
1010                 cfil_rw_unlock_exclusive(&cfil_lck_rw);
1011                 goto done;
1012         }
1013         cfc->cf_flags |= CFF_DETACHING;
1014         /*
1015          * Remove all sockets from the filter
1016          */
1017         while ((entry = TAILQ_FIRST(&cfc->cf_sock_entries)) != NULL) {
1018                 cfil_rw_lock_assert_held(&cfil_lck_rw, 1);
1019
1020                 verify_content_filter(cfc);
1021                 /*
1022                  * Accept all outstanding data by pushing to next filter
1023                  * or back to socket
1024                  *
1025                  * TBD: Actually we should make sure all data has been pushed
1026                  * back to socket
1027                  */
1028                 if (entry->cfe_cfil_info && entry->cfe_cfil_info->cfi_so) {
1029                         struct cfil_info *cfil_info = entry->cfe_cfil_info;
1030                         struct socket *so = cfil_info->cfi_so;
1031
1032                         /* Need to let data flow immediately */
1033                         entry->cfe_flags |= CFEF_SENT_SOCK_ATTACHED |
1034                                 CFEF_DATA_START;
1035
1036                         /*
1037                          * Respect locking hierarchy
1038                          */
1039                         cfil_rw_unlock_exclusive(&cfil_lck_rw);
1040
1041                         socket_lock(so, 1);
1042
1043                         /*
1044                          * When cfe_filter is NULL the filter is detached
1045                          * and the entry has been removed from cf_sock_entries
1046                          */
1047                         if (so->so_cfil == NULL || entry->cfe_filter == NULL) {
1048                                 cfil_rw_lock_exclusive(&cfil_lck_rw);
1049                                 goto release;
1050                         }
1051                         (void) cfil_action_data_pass(so, kcunit, 1,
1052                                         CFM_MAX_OFFSET,
1053                                         CFM_MAX_OFFSET);
1054
1055                         (void) cfil_action_data_pass(so, kcunit, 0,
1056                                         CFM_MAX_OFFSET,
1057                                         CFM_MAX_OFFSET);
1058
1059                         cfil_rw_lock_exclusive(&cfil_lck_rw);
1060
1061                         /*
1062                          * Check again as the socket may have been unlocked
1063                          * when when calling cfil_acquire_sockbuf()
1064                          */
1065                         if (so->so_cfil == NULL || entry->cfe_filter == NULL)
1066                                 goto release;
1067
1068                         /* The filter is now detached */
1069                         entry->cfe_flags |= CFEF_CFIL_DETACHED;
1070                         CFIL_LOG(LOG_NOTICE, "so %llx detached %u",
1071                                 (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit);
1072
1073                         if ((so->so_cfil->cfi_flags & CFIF_CLOSE_WAIT) &&
1074                             cfil_filters_attached(so) == 0) {
1075                                 CFIL_LOG(LOG_NOTICE, "so %llx waking",
1076                                         (uint64_t)VM_KERNEL_ADDRPERM(so));
1077                                 wakeup((caddr_t)&so->so_cfil);
1078                         }
1079
1080                         /*
1081                          * Remove the filter entry from the content filter
1082                          * but leave the rest of the state intact as the queues
1083                          * may not be empty yet
1084                          */
1085                         entry->cfe_filter = NULL;
1086                         entry->cfe_necp_control_unit = 0;
1087
1088                         TAILQ_REMOVE(&cfc->cf_sock_entries, entry, cfe_link);
1089                         cfc->cf_sock_count--;
1090 release:
1091                         socket_unlock(so, 1);
1092                 }
1093         }
1094         verify_content_filter(cfc);
1095
1096         VERIFY(cfc->cf_sock_count == 0);
1097
1098         /*
1099          * Make filter inactive
1100          */
1101         content_filters[kcunit - 1] = NULL;
1102         cfil_active_count--;
1103         cfil_rw_unlock_exclusive(&cfil_lck_rw);
1104
1105         zfree(content_filter_zone, cfc);
1106 done:
1107         if (error == 0)
1108                 OSIncrementAtomic(&cfil_stats.cfs_ctl_disconnect_ok);
1109         else
1110                 OSIncrementAtomic(&cfil_stats.cfs_ctl_disconnect_fail);
1111
1112         CFIL_LOG(LOG_INFO, "return %d cfil_active_count %u kcunit %u",
1113                 error, cfil_active_count, kcunit);
1114
1115         return (error);
1116 }
1117
1118 /*
1119  * cfil_acquire_sockbuf()
1120  *
1121  * Prevent any other thread from acquiring the sockbuf
1122  * We use sb_cfil_thread as a semaphore to prevent other threads from
1123  * messing with the sockbuf -- see sblock()
1124  * Note: We do not set SB_LOCK here because the thread may check or modify
1125  * SB_LOCK several times until it calls cfil_release_sockbuf() -- currently
1126  * sblock(), sbunlock() or sodefunct()
1127  */
1128 static int
1129 cfil_acquire_sockbuf(struct socket *so, int outgoing)
1130 {
1131         thread_t tp = current_thread();
1132         struct sockbuf *sb = outgoing ? &so->so_snd : &so->so_rcv;
1133         lck_mtx_t *mutex_held;
1134         int error = 0;
1135
1136         /*
1137          * Wait until no thread is holding the sockbuf and other content
1138          * filter threads have released the sockbuf
1139          */
1140         while ((sb->sb_flags & SB_LOCK) ||
1141                 (sb->sb_cfil_thread != NULL && sb->sb_cfil_thread != tp)) {
1142                 if (so->so_proto->pr_getlock != NULL)
1143                         mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1144                 else
1145                         mutex_held = so->so_proto->pr_domain->dom_mtx;
1146
1147                 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1148
1149                 sb->sb_wantlock++;
1150                 VERIFY(sb->sb_wantlock != 0);
1151
1152                 msleep(&sb->sb_flags, mutex_held, PSOCK, "cfil_acquire_sockbuf",
1153                         NULL);
1154
1155                 VERIFY(sb->sb_wantlock != 0);
1156                 sb->sb_wantlock--;
1157         }
1158         /*
1159          * Use reference count for repetitive calls on same thread
1160          */
1161         if (sb->sb_cfil_refs == 0) {
1162                 VERIFY(sb->sb_cfil_thread == NULL);
1163                 VERIFY((sb->sb_flags & SB_LOCK) == 0);
1164
1165                 sb->sb_cfil_thread = tp;
1166                 sb->sb_flags |= SB_LOCK;
1167         }
1168         sb->sb_cfil_refs++;
1169
1170         /* We acquire the socket buffer when we need to cleanup */
1171         if (so->so_cfil == NULL) {
1172                 CFIL_LOG(LOG_ERR, "so %llx cfil detached",
1173                         (uint64_t)VM_KERNEL_ADDRPERM(so));
1174                 error = 0;
1175         } else if (so->so_cfil->cfi_flags & CFIF_DROP) {
1176                 CFIL_LOG(LOG_ERR, "so %llx drop set",
1177                         (uint64_t)VM_KERNEL_ADDRPERM(so));
1178                 error = EPIPE;
1179         }
1180
1181         return (error);
1182 }
1183
1184 static void
1185 cfil_release_sockbuf(struct socket *so, int outgoing)
1186 {
1187         struct sockbuf *sb = outgoing ? &so->so_snd : &so->so_rcv;
1188         thread_t tp = current_thread();
1189
1190         socket_lock_assert_owned(so);
1191
1192         if (sb->sb_cfil_thread != NULL && sb->sb_cfil_thread != tp)
1193                 panic("%s sb_cfil_thread %p not current %p", __func__,
1194                         sb->sb_cfil_thread, tp);
1195         /*
1196          * Don't panic if we are defunct because SB_LOCK has
1197          * been cleared by sodefunct()
1198          */
1199         if (!(so->so_flags & SOF_DEFUNCT) && !(sb->sb_flags & SB_LOCK))
1200                 panic("%s SB_LOCK not set on %p", __func__,
1201                         sb);
1202         /*
1203          * We can unlock when the thread unwinds to the last reference
1204          */
1205         sb->sb_cfil_refs--;
1206         if (sb->sb_cfil_refs == 0) {
1207                 sb->sb_cfil_thread = NULL;
1208                 sb->sb_flags &= ~SB_LOCK;
1209
1210                 if (sb->sb_wantlock > 0)
1211                         wakeup(&sb->sb_flags);
1212         }
1213 }
1214
1215 cfil_sock_id_t
1216 cfil_sock_id_from_socket(struct socket *so)
1217 {
1218         if ((so->so_flags & SOF_CONTENT_FILTER) && so->so_cfil)
1219                 return (so->so_cfil->cfi_sock_id);
1220         else
1221                 return (CFIL_SOCK_ID_NONE);
1222 }
1223
1224 static struct socket *
1225 cfil_socket_from_sock_id(cfil_sock_id_t cfil_sock_id)
1226 {
1227         struct socket *so = NULL;
1228         u_int64_t gencnt = cfil_sock_id >> 32;
1229         u_int32_t flowhash = (u_int32_t)(cfil_sock_id & 0x0ffffffff);
1230         struct inpcb *inp = NULL;
1231         struct inpcbinfo *pcbinfo = &tcbinfo;
1232
1233         lck_rw_lock_shared(pcbinfo->ipi_lock);
1234         LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
1235                 if (inp->inp_state != INPCB_STATE_DEAD &&
1236                         inp->inp_socket != NULL &&
1237                         inp->inp_flowhash == flowhash &&
1238                         (inp->inp_socket->so_gencnt & 0x0ffffffff) == gencnt &&
1239                         inp->inp_socket->so_cfil != NULL) {
1240                         so = inp->inp_socket;
1241                         break;
1242                 }
1243         }
1244         lck_rw_done(pcbinfo->ipi_lock);
1245
1246         if (so == NULL) {
1247                 OSIncrementAtomic(&cfil_stats.cfs_sock_id_not_found);
1248                 CFIL_LOG(LOG_DEBUG,
1249                         "no socket for sock_id %llx gencnt %llx flowhash %x",
1250                         cfil_sock_id, gencnt, flowhash);
1251         }
1252
1253         return (so);
1254 }
1255
1256 static struct socket *
1257 cfil_socket_from_client_uuid(uuid_t necp_client_uuid, bool *cfil_attached)
1258 {
1259         struct socket *so = NULL;
1260         struct inpcb *inp = NULL;
1261         struct inpcbinfo *pcbinfo = &tcbinfo;
1262
1263         lck_rw_lock_shared(pcbinfo->ipi_lock);
1264         LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
1265                 if (inp->inp_state != INPCB_STATE_DEAD &&
1266                         inp->inp_socket != NULL &&
1267                         uuid_compare(inp->necp_client_uuid, necp_client_uuid) == 0) {
1268                         *cfil_attached = (inp->inp_socket->so_cfil != NULL);
1269                         so = inp->inp_socket;
1270                         break;
1271                 }
1272         }
1273         lck_rw_done(pcbinfo->ipi_lock);
1274
1275         return (so);
1276 }
1277
1278 static errno_t
1279 cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m,
1280                 int flags)
1281 {
1282 #pragma unused(kctlref, flags)
1283         errno_t error = 0;
1284         struct cfil_msg_hdr *msghdr;
1285         struct content_filter *cfc = (struct content_filter *)unitinfo;
1286         struct socket *so;
1287         struct cfil_msg_action *action_msg;
1288         struct cfil_entry *entry;
1289
1290         CFIL_LOG(LOG_INFO, "");
1291
1292         if (content_filters == NULL) {
1293                 CFIL_LOG(LOG_ERR, "no content filter");
1294                 error = EINVAL;
1295                 goto done;
1296         }
1297         if (kcunit > MAX_CONTENT_FILTER) {
1298                 CFIL_LOG(LOG_ERR, "kcunit %u > MAX_CONTENT_FILTER (%d)",
1299                         kcunit, MAX_CONTENT_FILTER);
1300                 error = EINVAL;
1301                 goto done;
1302         }
1303
1304         if (m_length(m) < sizeof(struct cfil_msg_hdr)) {
1305                 CFIL_LOG(LOG_ERR, "too short %u", m_length(m));
1306                 error = EINVAL;
1307                 goto done;
1308         }
1309         msghdr = (struct cfil_msg_hdr *)mbuf_data(m);
1310         if (msghdr->cfm_version != CFM_VERSION_CURRENT) {
1311                 CFIL_LOG(LOG_ERR, "bad version %u", msghdr->cfm_version);
1312                 error = EINVAL;
1313                 goto done;
1314         }
1315         if (msghdr->cfm_type != CFM_TYPE_ACTION) {
1316                 CFIL_LOG(LOG_ERR, "bad type %u", msghdr->cfm_type);
1317                 error = EINVAL;
1318                 goto done;
1319         }
1320         /* Validate action operation */
1321         switch (msghdr->cfm_op) {
1322                 case CFM_OP_DATA_UPDATE:
1323                         OSIncrementAtomic(
1324                                 &cfil_stats.cfs_ctl_action_data_update);
1325                         break;
1326                 case CFM_OP_DROP:
1327                         OSIncrementAtomic(&cfil_stats.cfs_ctl_action_drop);
1328                         break;
1329                 case CFM_OP_BLESS_CLIENT:
1330                         if (msghdr->cfm_len != sizeof(struct cfil_msg_bless_client)) {
1331                                 OSIncrementAtomic(&cfil_stats.cfs_ctl_action_bad_len);
1332                                 error = EINVAL;
1333                                 CFIL_LOG(LOG_ERR, "bad len: %u for op %u",
1334                                                  msghdr->cfm_len,
1335                                                  msghdr->cfm_op);
1336                                 goto done;
1337                         }
1338                         error = cfil_action_bless_client(kcunit, msghdr);
1339                         goto done;
1340                 default:
1341                         OSIncrementAtomic(&cfil_stats.cfs_ctl_action_bad_op);
1342                         CFIL_LOG(LOG_ERR, "bad op %u", msghdr->cfm_op);
1343                         error = EINVAL;
1344                         goto done;
1345                 }
1346                 if (msghdr->cfm_len != sizeof(struct cfil_msg_action)) {
1347                         OSIncrementAtomic(&cfil_stats.cfs_ctl_action_bad_len);
1348                                 error = EINVAL;
1349                                 CFIL_LOG(LOG_ERR, "bad len: %u for op %u",
1350                                         msghdr->cfm_len,
1351                                         msghdr->cfm_op);
1352                                 goto done;
1353                         }
1354         cfil_rw_lock_shared(&cfil_lck_rw);
1355         if (cfc != (void *)content_filters[kcunit - 1]) {
1356                 CFIL_LOG(LOG_ERR, "unitinfo does not match for kcunit %u",
1357                         kcunit);
1358                 error = EINVAL;
1359                 cfil_rw_unlock_shared(&cfil_lck_rw);
1360                 goto done;
1361         }
1362
1363         so = cfil_socket_from_sock_id(msghdr->cfm_sock_id);
1364         if (so == NULL) {
1365                 CFIL_LOG(LOG_NOTICE, "bad sock_id %llx",
1366                         msghdr->cfm_sock_id);
1367                 error = EINVAL;
1368                 cfil_rw_unlock_shared(&cfil_lck_rw);
1369                 goto done;
1370         }
1371         cfil_rw_unlock_shared(&cfil_lck_rw);
1372
1373         socket_lock(so, 1);
1374
1375         if (so->so_cfil == NULL) {
1376                 CFIL_LOG(LOG_NOTICE, "so %llx not attached",
1377                         (uint64_t)VM_KERNEL_ADDRPERM(so));
1378                 error = EINVAL;
1379                 goto unlock;
1380         } else if (so->so_cfil->cfi_flags & CFIF_DROP) {
1381                 CFIL_LOG(LOG_NOTICE, "so %llx drop set",
1382                         (uint64_t)VM_KERNEL_ADDRPERM(so));
1383                 error = EINVAL;
1384                 goto unlock;
1385         }
1386         entry = &so->so_cfil->cfi_entries[kcunit - 1];
1387         if (entry->cfe_filter == NULL) {
1388                 CFIL_LOG(LOG_NOTICE, "so %llx no filter",
1389                         (uint64_t)VM_KERNEL_ADDRPERM(so));
1390                 error = EINVAL;
1391                 goto unlock;
1392         }
1393
1394         if (entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED)
1395                 entry->cfe_flags |= CFEF_DATA_START;
1396         else {
1397                 CFIL_LOG(LOG_ERR,
1398                         "so %llx attached not sent for %u",
1399                         (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit);
1400                 error = EINVAL;
1401                 goto unlock;
1402         }
1403
1404         microuptime(&entry->cfe_last_action);
1405         CFI_ADD_TIME_LOG(so->so_cfil, &entry->cfe_last_action, &so->so_cfil->cfi_first_event, msghdr->cfm_op);
1406
1407         action_msg = (struct cfil_msg_action *)msghdr;
1408
1409         switch (msghdr->cfm_op) {
1410                 case CFM_OP_DATA_UPDATE:
1411                         if (action_msg->cfa_out_peek_offset != 0 ||
1412                                 action_msg->cfa_out_pass_offset != 0)
1413                                 error = cfil_action_data_pass(so, kcunit, 1,
1414                                         action_msg->cfa_out_pass_offset,
1415                                         action_msg->cfa_out_peek_offset);
1416                         if (error == EJUSTRETURN)
1417                                 error = 0;
1418                         if (error != 0)
1419                                 break;
1420                         if (action_msg->cfa_in_peek_offset != 0 ||
1421                                 action_msg->cfa_in_pass_offset != 0)
1422                                 error = cfil_action_data_pass(so, kcunit, 0,
1423                                         action_msg->cfa_in_pass_offset,
1424                                         action_msg->cfa_in_peek_offset);
1425                         if (error == EJUSTRETURN)
1426                                 error = 0;
1427                         break;
1428
1429                 case CFM_OP_DROP:
1430                         error = cfil_action_drop(so, kcunit);
1431                         break;
1432
1433                 default:
1434                         error = EINVAL;
1435                         break;
1436         }
1437 unlock:
1438         socket_unlock(so, 1);
1439 done:
1440         mbuf_freem(m);
1441
1442         if (error == 0)
1443                 OSIncrementAtomic(&cfil_stats.cfs_ctl_send_ok);
1444         else
1445                 OSIncrementAtomic(&cfil_stats.cfs_ctl_send_bad);
1446
1447         return (error);
1448 }
1449
1450 static errno_t
1451 cfil_ctl_getopt(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
1452                 int opt, void *data, size_t *len)
1453 {
1454 #pragma unused(kctlref, opt)
1455         errno_t error = 0;
1456         struct content_filter *cfc = (struct content_filter *)unitinfo;
1457
1458         CFIL_LOG(LOG_NOTICE, "");
1459
1460         cfil_rw_lock_shared(&cfil_lck_rw);
1461
1462         if (content_filters == NULL) {
1463                 CFIL_LOG(LOG_ERR, "no content filter");
1464                 error = EINVAL;
1465                 goto done;
1466         }
1467         if (kcunit > MAX_CONTENT_FILTER) {
1468                 CFIL_LOG(LOG_ERR, "kcunit %u > MAX_CONTENT_FILTER (%d)",
1469                         kcunit, MAX_CONTENT_FILTER);
1470                 error = EINVAL;
1471                 goto done;
1472         }
1473         if (cfc != (void *)content_filters[kcunit - 1]) {
1474                 CFIL_LOG(LOG_ERR, "unitinfo does not match for kcunit %u",
1475                         kcunit);
1476                 error = EINVAL;
1477                 goto done;
1478         }
1479         switch (opt) {
1480                 case CFIL_OPT_NECP_CONTROL_UNIT:
1481                         if (*len < sizeof(uint32_t)) {
1482                                 CFIL_LOG(LOG_ERR, "len too small %lu", *len);
1483                                 error = EINVAL;
1484                                 goto done;
1485                         }
1486                         if (data != NULL) {
1487                                 *(uint32_t *)data = cfc->cf_necp_control_unit;
1488                         }
1489                         break;
1490                 case CFIL_OPT_GET_SOCKET_INFO:
1491                         if (*len != sizeof(struct cfil_opt_sock_info)) {
1492                                 CFIL_LOG(LOG_ERR, "len does not match %lu", *len);
1493                                 error = EINVAL;
1494                                 goto done;
1495                         }
1496                         if (data == NULL) {
1497                                 CFIL_LOG(LOG_ERR, "data not passed");
1498                                 error = EINVAL;
1499                                 goto done;
1500                         }
1501
1502                         struct cfil_opt_sock_info *sock_info =
1503                                                                                         (struct cfil_opt_sock_info *) data;
1504                         struct socket *sock =
1505                                                         cfil_socket_from_sock_id(sock_info->cfs_sock_id);
1506                         if (sock == NULL) {
1507                                 CFIL_LOG(LOG_NOTICE, "bad sock_id %llx",
1508                                         sock_info->cfs_sock_id);
1509                                 error = ENOENT;
1510                                 goto done;
1511                         }
1512
1513                         // Unlock here so that we never hold both cfil_lck_rw and the
1514                         // socket_lock at the same time. Otherwise, this can deadlock
1515                         // because soclose() takes the socket_lock and then exclusive
1516                         // cfil_lck_rw and we require the opposite order.
1517
1518                         // WARNING: Be sure to never use anything protected
1519                         //     by cfil_lck_rw beyond this point.
1520                         // WARNING: Be sure to avoid fallthrough and
1521                         //     goto return_already_unlocked from this branch.
1522                         cfil_rw_unlock_shared(&cfil_lck_rw);
1523
1524                         socket_lock(sock, 1);
1525
1526                         if (sock->so_cfil == NULL) {
1527                                 CFIL_LOG(LOG_NOTICE, "so %llx not attached, cannot fetch info",
1528                                         (uint64_t)VM_KERNEL_ADDRPERM(sock));
1529                                 error = EINVAL;
1530                                 socket_unlock(sock, 1);
1531                                 goto return_already_unlocked;
1532                         }
1533
1534                         // Fill out family, type, and protocol
1535                         sock_info->cfs_sock_family = sock->so_proto->pr_domain->dom_family;
1536                         sock_info->cfs_sock_type = sock->so_proto->pr_type;
1537                         sock_info->cfs_sock_protocol = sock->so_proto->pr_protocol;
1538
1539                         // Source and destination addresses
1540                         struct inpcb *inp = sotoinpcb(sock);
1541                         if (inp->inp_vflag & INP_IPV6) {
1542                                 fill_ip6_sockaddr_4_6(&sock_info->cfs_local,
1543                                         &inp->in6p_laddr, inp->inp_lport);
1544                                 fill_ip6_sockaddr_4_6(&sock_info->cfs_remote,
1545                                         &inp->in6p_faddr, inp->inp_fport);
1546                         } else if (inp->inp_vflag & INP_IPV4) {
1547                                 fill_ip_sockaddr_4_6(&sock_info->cfs_local,
1548                                         inp->inp_laddr, inp->inp_lport);
1549                                 fill_ip_sockaddr_4_6(&sock_info->cfs_remote,
1550                                         inp->inp_faddr, inp->inp_fport);
1551                         }
1552
1553                         // Set the pid info
1554                         sock_info->cfs_pid = sock->last_pid;
1555                         memcpy(sock_info->cfs_uuid, sock->last_uuid, sizeof(uuid_t));
1556
1557                         if (sock->so_flags & SOF_DELEGATED) {
1558                                 sock_info->cfs_e_pid = sock->e_pid;
1559                                 memcpy(sock_info->cfs_e_uuid, sock->e_uuid, sizeof(uuid_t));
1560                         } else {
1561                                 sock_info->cfs_e_pid = sock->last_pid;
1562                                 memcpy(sock_info->cfs_e_uuid, sock->last_uuid, sizeof(uuid_t));
1563                         }
1564
1565                         socket_unlock(sock, 1);
1566
1567                         goto return_already_unlocked;
1568                 default:
1569                         error = ENOPROTOOPT;
1570                         break;
1571         }
1572 done:
1573         cfil_rw_unlock_shared(&cfil_lck_rw);
1574
1575         return (error);
1576
1577 return_already_unlocked:
1578
1579         return (error);
1580 }
1581
1582 static errno_t
1583 cfil_ctl_setopt(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
1584                 int opt, void *data, size_t len)
1585 {
1586 #pragma unused(kctlref, opt)
1587         errno_t error = 0;
1588         struct content_filter *cfc = (struct content_filter *)unitinfo;
1589
1590         CFIL_LOG(LOG_NOTICE, "");
1591
1592         cfil_rw_lock_exclusive(&cfil_lck_rw);
1593
1594         if (content_filters == NULL) {
1595                 CFIL_LOG(LOG_ERR, "no content filter");
1596                 error = EINVAL;
1597                 goto done;
1598         }
1599         if (kcunit > MAX_CONTENT_FILTER) {
1600                 CFIL_LOG(LOG_ERR, "kcunit %u > MAX_CONTENT_FILTER (%d)",
1601                         kcunit, MAX_CONTENT_FILTER);
1602                 error = EINVAL;
1603                 goto done;
1604         }
1605         if (cfc != (void *)content_filters[kcunit - 1]) {
1606                 CFIL_LOG(LOG_ERR, "unitinfo does not match for kcunit %u",
1607                         kcunit);
1608                 error = EINVAL;
1609                 goto done;
1610         }
1611         switch (opt) {
1612                 case CFIL_OPT_NECP_CONTROL_UNIT:
1613                         if (len < sizeof(uint32_t)) {
1614                                 CFIL_LOG(LOG_ERR, "CFIL_OPT_NECP_CONTROL_UNIT "
1615                                         "len too small %lu", len);
1616                                 error = EINVAL;
1617                                 goto done;
1618                         }
1619                         if (cfc->cf_necp_control_unit != 0) {
1620                                 CFIL_LOG(LOG_ERR, "CFIL_OPT_NECP_CONTROL_UNIT "
1621                                         "already set %u",
1622                                         cfc->cf_necp_control_unit);
1623                                 error = EINVAL;
1624                                 goto done;
1625                         }
1626                         cfc->cf_necp_control_unit = *(uint32_t *)data;
1627                         break;
1628                 default:
1629                         error = ENOPROTOOPT;
1630                         break;
1631         }
1632 done:
1633         cfil_rw_unlock_exclusive(&cfil_lck_rw);
1634
1635         return (error);
1636 }
1637
1638
1639 static void
1640 cfil_ctl_rcvd(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, int flags)
1641 {
1642 #pragma unused(kctlref, flags)
1643         struct content_filter *cfc = (struct content_filter *)unitinfo;
1644         struct socket *so = NULL;
1645         int error;
1646         struct cfil_entry *entry;
1647
1648         CFIL_LOG(LOG_INFO, "");
1649
1650         if (content_filters == NULL) {
1651                 CFIL_LOG(LOG_ERR, "no content filter");
1652                 OSIncrementAtomic(&cfil_stats.cfs_ctl_rcvd_bad);
1653                 return;
1654         }
1655         if (kcunit > MAX_CONTENT_FILTER) {
1656                 CFIL_LOG(LOG_ERR, "kcunit %u > MAX_CONTENT_FILTER (%d)",
1657                         kcunit, MAX_CONTENT_FILTER);
1658                 OSIncrementAtomic(&cfil_stats.cfs_ctl_rcvd_bad);
1659                 return;
1660         }
1661         cfil_rw_lock_shared(&cfil_lck_rw);
1662         if (cfc != (void *)content_filters[kcunit - 1]) {
1663                 CFIL_LOG(LOG_ERR, "unitinfo does not match for kcunit %u",
1664                         kcunit);
1665                 OSIncrementAtomic(&cfil_stats.cfs_ctl_rcvd_bad);
1666                 goto done;
1667         }
1668         /* Let's assume the flow control is lifted */
1669         if (cfc->cf_flags & CFF_FLOW_CONTROLLED) {
1670                 if (!cfil_rw_lock_shared_to_exclusive(&cfil_lck_rw))
1671                         cfil_rw_lock_exclusive(&cfil_lck_rw);
1672
1673         cfc->cf_flags &= ~CFF_FLOW_CONTROLLED;
1674
1675                 cfil_rw_lock_exclusive_to_shared(&cfil_lck_rw);
1676                 LCK_RW_ASSERT(&cfil_lck_rw, LCK_RW_ASSERT_SHARED);
1677         }
1678         /*
1679          * Flow control will be raised again as soon as an entry cannot enqueue
1680          * to the kernel control socket
1681          */
1682         while ((cfc->cf_flags & CFF_FLOW_CONTROLLED) == 0) {
1683                 verify_content_filter(cfc);
1684
1685                 cfil_rw_lock_assert_held(&cfil_lck_rw, 0);
1686
1687                 /* Find an entry that is flow controlled */
1688                 TAILQ_FOREACH(entry, &cfc->cf_sock_entries, cfe_link) {
1689                         if (entry->cfe_cfil_info == NULL ||
1690                                 entry->cfe_cfil_info->cfi_so == NULL)
1691                                 continue;
1692                         if ((entry->cfe_flags & CFEF_FLOW_CONTROLLED) == 0)
1693                                 continue;
1694                 }
1695                 if (entry == NULL)
1696                         break;
1697
1698                 OSIncrementAtomic(&cfil_stats.cfs_ctl_rcvd_flow_lift);
1699
1700                 so = entry->cfe_cfil_info->cfi_so;
1701
1702                 cfil_rw_unlock_shared(&cfil_lck_rw);
1703                 socket_lock(so, 1);
1704
1705                 do {
1706                         error = cfil_acquire_sockbuf(so, 1);
1707                         if (error == 0)
1708                                 error = cfil_data_service_ctl_q(so, kcunit, 1);
1709                         cfil_release_sockbuf(so, 1);
1710                         if (error != 0)
1711                                 break;
1712
1713                         error = cfil_acquire_sockbuf(so, 0);
1714                         if (error == 0)
1715                                 error = cfil_data_service_ctl_q(so, kcunit, 0);
1716                         cfil_release_sockbuf(so, 0);
1717                 } while (0);
1718
1719                 socket_lock_assert_owned(so);
1720                 socket_unlock(so, 1);
1721
1722                 cfil_rw_lock_shared(&cfil_lck_rw);
1723         }
1724 done:
1725         cfil_rw_unlock_shared(&cfil_lck_rw);
1726 }
1727
1728 void
1729 cfil_init(void)
1730 {
1731         struct kern_ctl_reg kern_ctl;
1732         errno_t error = 0;
1733         vm_size_t content_filter_size = 0;      /* size of content_filter */
1734         vm_size_t cfil_info_size = 0;   /* size of cfil_info */
1735
1736         CFIL_LOG(LOG_NOTICE, "");
1737
1738         /*
1739          * Compile time verifications
1740          */
1741         _CASSERT(CFIL_MAX_FILTER_COUNT == MAX_CONTENT_FILTER);
1742         _CASSERT(sizeof(struct cfil_filter_stat) % sizeof(uint32_t) == 0);
1743         _CASSERT(sizeof(struct cfil_entry_stat) % sizeof(uint32_t) == 0);
1744         _CASSERT(sizeof(struct cfil_sock_stat) % sizeof(uint32_t) == 0);
1745
1746         /*
1747          * Runtime time verifications
1748          */
1749         VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_ctl_q_in_enqueued,
1750                 sizeof(uint32_t)));
1751         VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_ctl_q_out_enqueued,
1752                 sizeof(uint32_t)));
1753         VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_ctl_q_in_peeked,
1754                 sizeof(uint32_t)));
1755         VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_ctl_q_out_peeked,
1756                 sizeof(uint32_t)));
1757
1758         VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_pending_q_in_enqueued,
1759                 sizeof(uint32_t)));
1760         VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_pending_q_out_enqueued,
1761                 sizeof(uint32_t)));
1762
1763         VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_inject_q_in_enqueued,
1764                 sizeof(uint32_t)));
1765         VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_inject_q_out_enqueued,
1766                 sizeof(uint32_t)));
1767         VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_inject_q_in_passed,
1768                 sizeof(uint32_t)));
1769         VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_inject_q_out_passed,
1770                 sizeof(uint32_t)));
1771
1772         /*
1773          * Zone for content filters kernel control sockets
1774          */
1775         content_filter_size = sizeof(struct content_filter);
1776         content_filter_zone = zinit(content_filter_size,
1777                                 CONTENT_FILTER_ZONE_MAX * content_filter_size,
1778                                 0,
1779                                 CONTENT_FILTER_ZONE_NAME);
1780         if (content_filter_zone == NULL) {
1781                 panic("%s: zinit(%s) failed", __func__,
1782                         CONTENT_FILTER_ZONE_NAME);
1783                 /* NOTREACHED */
1784         }
1785         zone_change(content_filter_zone, Z_CALLERACCT, FALSE);
1786         zone_change(content_filter_zone, Z_EXPAND, TRUE);
1787
1788         /*
1789          * Zone for per socket content filters
1790          */
1791         cfil_info_size = sizeof(struct cfil_info);
1792         cfil_info_zone = zinit(cfil_info_size,
1793                                 CFIL_INFO_ZONE_MAX * cfil_info_size,
1794                                 0,
1795                                 CFIL_INFO_ZONE_NAME);
1796         if (cfil_info_zone == NULL) {
1797                 panic("%s: zinit(%s) failed", __func__, CFIL_INFO_ZONE_NAME);
1798                 /* NOTREACHED */
1799         }
1800         zone_change(cfil_info_zone, Z_CALLERACCT, FALSE);
1801         zone_change(cfil_info_zone, Z_EXPAND, TRUE);
1802
1803         /*
1804          * Allocate locks
1805          */
1806         cfil_lck_grp_attr = lck_grp_attr_alloc_init();
1807         if (cfil_lck_grp_attr == NULL) {
1808                 panic("%s: lck_grp_attr_alloc_init failed", __func__);
1809                 /* NOTREACHED */
1810         }
1811         cfil_lck_grp = lck_grp_alloc_init("content filter",
1812                                         cfil_lck_grp_attr);
1813         if (cfil_lck_grp == NULL) {
1814                 panic("%s: lck_grp_alloc_init failed", __func__);
1815                 /* NOTREACHED */
1816         }
1817         cfil_lck_attr = lck_attr_alloc_init();
1818         if (cfil_lck_attr == NULL) {
1819                 panic("%s: lck_attr_alloc_init failed", __func__);
1820                 /* NOTREACHED */
1821         }
1822         lck_rw_init(&cfil_lck_rw, cfil_lck_grp, cfil_lck_attr);
1823
1824         TAILQ_INIT(&cfil_sock_head);
1825
1826         /*
1827          * Register kernel control
1828          */
1829         bzero(&kern_ctl, sizeof(kern_ctl));
1830         strlcpy(kern_ctl.ctl_name, CONTENT_FILTER_CONTROL_NAME,
1831                 sizeof(kern_ctl.ctl_name));
1832         kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED | CTL_FLAG_REG_EXTENDED;
1833         kern_ctl.ctl_sendsize = 512 * 1024; /* enough? */
1834         kern_ctl.ctl_recvsize = 512 * 1024; /* enough? */
1835         kern_ctl.ctl_connect = cfil_ctl_connect;
1836         kern_ctl.ctl_disconnect = cfil_ctl_disconnect;
1837         kern_ctl.ctl_send = cfil_ctl_send;
1838         kern_ctl.ctl_getopt = cfil_ctl_getopt;
1839         kern_ctl.ctl_setopt = cfil_ctl_setopt;
1840         kern_ctl.ctl_rcvd = cfil_ctl_rcvd;
1841         error = ctl_register(&kern_ctl, &cfil_kctlref);
1842         if (error != 0) {
1843                 CFIL_LOG(LOG_ERR, "ctl_register failed: %d", error);
1844                 return;
1845         }
1846 }
1847
1848 struct cfil_info *
1849 cfil_info_alloc(struct socket *so)
1850 {
1851         int kcunit;
1852         struct cfil_info *cfil_info = NULL;
1853         struct inpcb *inp = sotoinpcb(so);
1854
1855         CFIL_LOG(LOG_INFO, "");
1856
1857         socket_lock_assert_owned(so);
1858
1859         cfil_info = zalloc(cfil_info_zone);
1860         if (cfil_info == NULL)
1861                 goto done;
1862         bzero(cfil_info, sizeof(struct cfil_info));
1863
1864         cfil_queue_init(&cfil_info->cfi_snd.cfi_inject_q);
1865         cfil_queue_init(&cfil_info->cfi_rcv.cfi_inject_q);
1866
1867         for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
1868                 struct cfil_entry *entry;
1869
1870                 entry = &cfil_info->cfi_entries[kcunit - 1];
1871                 entry->cfe_cfil_info = cfil_info;
1872
1873                 /* Initialize the filter entry */
1874                 entry->cfe_filter = NULL;
1875                 entry->cfe_flags = 0;
1876                 entry->cfe_necp_control_unit = 0;
1877                 entry->cfe_snd.cfe_pass_offset = 0;
1878                 entry->cfe_snd.cfe_peek_offset = 0;
1879                 entry->cfe_snd.cfe_peeked = 0;
1880                 entry->cfe_rcv.cfe_pass_offset = 0;
1881                 entry->cfe_rcv.cfe_peek_offset = 0;
1882                 entry->cfe_rcv.cfe_peeked = 0;
1883
1884                 cfil_queue_init(&entry->cfe_snd.cfe_pending_q);
1885                 cfil_queue_init(&entry->cfe_rcv.cfe_pending_q);
1886                 cfil_queue_init(&entry->cfe_snd.cfe_ctl_q);
1887                 cfil_queue_init(&entry->cfe_rcv.cfe_ctl_q);
1888         }
1889
1890         cfil_rw_lock_exclusive(&cfil_lck_rw);
1891
1892         so->so_cfil = cfil_info;
1893         cfil_info->cfi_so = so;
1894         /*
1895          * Create a cfi_sock_id that's not the socket pointer!
1896          */
1897         if (inp->inp_flowhash == 0)
1898                 inp->inp_flowhash = inp_calc_flowhash(inp);
1899         cfil_info->cfi_sock_id =
1900                 ((so->so_gencnt << 32) | inp->inp_flowhash);
1901
1902         TAILQ_INSERT_TAIL(&cfil_sock_head, cfil_info, cfi_link);
1903
1904         cfil_sock_attached_count++;
1905
1906         cfil_rw_unlock_exclusive(&cfil_lck_rw);
1907
1908 done:
1909         if (cfil_info != NULL)
1910                 OSIncrementAtomic(&cfil_stats.cfs_cfi_alloc_ok);
1911         else
1912                 OSIncrementAtomic(&cfil_stats.cfs_cfi_alloc_fail);
1913
1914         return (cfil_info);
1915 }
1916
1917 int
1918 cfil_info_attach_unit(struct socket *so, uint32_t filter_control_unit)
1919 {
1920         int kcunit;
1921         struct cfil_info *cfil_info = so->so_cfil;
1922         int attached = 0;
1923
1924         CFIL_LOG(LOG_INFO, "");
1925
1926         socket_lock_assert_owned(so);
1927
1928         cfil_rw_lock_exclusive(&cfil_lck_rw);
1929
1930         for (kcunit = 1;
1931                 content_filters != NULL && kcunit <= MAX_CONTENT_FILTER;
1932                 kcunit++) {
1933                 struct content_filter *cfc = content_filters[kcunit - 1];
1934                 struct cfil_entry *entry;
1935
1936                 if (cfc == NULL)
1937                         continue;
1938                 if (cfc->cf_necp_control_unit != filter_control_unit)
1939                         continue;
1940
1941                 entry = &cfil_info->cfi_entries[kcunit - 1];
1942
1943                 entry->cfe_filter = cfc;
1944                 entry->cfe_necp_control_unit = filter_control_unit;
1945                 TAILQ_INSERT_TAIL(&cfc->cf_sock_entries, entry, cfe_link);
1946                 cfc->cf_sock_count++;
1947                 verify_content_filter(cfc);
1948                 attached = 1;
1949                 entry->cfe_flags |= CFEF_CFIL_ATTACHED;
1950                 break;
1951         }
1952
1953         cfil_rw_unlock_exclusive(&cfil_lck_rw);
1954
1955         return (attached);
1956 }
1957
1958 static void
1959 cfil_info_free(struct socket *so, struct cfil_info *cfil_info)
1960 {
1961         int kcunit;
1962         uint64_t in_drain = 0;
1963         uint64_t out_drained = 0;
1964
1965         so->so_cfil = NULL;
1966
1967         if (so->so_flags & SOF_CONTENT_FILTER) {
1968                 so->so_flags &= ~SOF_CONTENT_FILTER;
1969                 VERIFY(so->so_usecount > 0);
1970                 so->so_usecount--;
1971         }
1972         if (cfil_info == NULL)
1973                 return;
1974
1975         CFIL_LOG(LOG_INFO, "");
1976
1977         cfil_rw_lock_exclusive(&cfil_lck_rw);
1978
1979         for (kcunit = 1;
1980                 content_filters != NULL && kcunit <= MAX_CONTENT_FILTER;
1981                 kcunit++) {
1982                 struct cfil_entry *entry;
1983                 struct content_filter *cfc;
1984
1985                 entry = &cfil_info->cfi_entries[kcunit - 1];
1986
1987                 /* Don't be silly and try to detach twice */
1988                 if (entry->cfe_filter == NULL)
1989                         continue;
1990
1991                 cfc = content_filters[kcunit - 1];
1992
1993                 VERIFY(cfc == entry->cfe_filter);
1994
1995                 entry->cfe_filter = NULL;
1996                 entry->cfe_necp_control_unit = 0;
1997                 TAILQ_REMOVE(&cfc->cf_sock_entries, entry, cfe_link);
1998                 cfc->cf_sock_count--;
1999
2000                 verify_content_filter(cfc);
2001         }
2002         cfil_sock_attached_count--;
2003         TAILQ_REMOVE(&cfil_sock_head, cfil_info, cfi_link);
2004
2005         out_drained += cfil_queue_drain(&cfil_info->cfi_snd.cfi_inject_q);
2006         in_drain += cfil_queue_drain(&cfil_info->cfi_rcv.cfi_inject_q);
2007
2008         for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
2009                 struct cfil_entry *entry;
2010
2011                 entry = &cfil_info->cfi_entries[kcunit - 1];
2012                 out_drained += cfil_queue_drain(&entry->cfe_snd.cfe_pending_q);
2013                 in_drain += cfil_queue_drain(&entry->cfe_rcv.cfe_pending_q);
2014                 out_drained += cfil_queue_drain(&entry->cfe_snd.cfe_ctl_q);
2015                 in_drain += cfil_queue_drain(&entry->cfe_rcv.cfe_ctl_q);
2016         }
2017         cfil_rw_unlock_exclusive(&cfil_lck_rw);
2018
2019         if (out_drained)
2020                 OSIncrementAtomic(&cfil_stats.cfs_flush_out_free);
2021         if (in_drain)
2022                 OSIncrementAtomic(&cfil_stats.cfs_flush_in_free);
2023
2024         zfree(cfil_info_zone, cfil_info);
2025 }
2026
2027 /*
2028  * Entry point from Sockets layer
2029  * The socket is locked.
2030  */
2031 errno_t
2032 cfil_sock_attach(struct socket *so)
2033 {
2034         errno_t error = 0;
2035         uint32_t filter_control_unit;
2036
2037         socket_lock_assert_owned(so);
2038
2039         /* Limit ourselves to TCP that are not MPTCP subflows */
2040         if ((so->so_proto->pr_domain->dom_family != PF_INET &&
2041                 so->so_proto->pr_domain->dom_family != PF_INET6) ||
2042                 so->so_proto->pr_type != SOCK_STREAM ||
2043                 so->so_proto->pr_protocol != IPPROTO_TCP ||
2044                 (so->so_flags & SOF_MP_SUBFLOW) != 0 ||
2045                 (so->so_flags1 & SOF1_CONTENT_FILTER_SKIP) != 0)
2046                 goto done;
2047
2048         filter_control_unit = necp_socket_get_content_filter_control_unit(so);
2049         if (filter_control_unit == 0)
2050                 goto done;
2051
2052         if ((filter_control_unit & NECP_MASK_USERSPACE_ONLY) != 0) {
2053                 OSIncrementAtomic(&cfil_stats.cfs_sock_userspace_only);
2054                 goto done;
2055         }
2056         if (cfil_active_count == 0) {
2057                 OSIncrementAtomic(&cfil_stats.cfs_sock_attach_in_vain);
2058                 goto done;
2059         }
2060         if (so->so_cfil != NULL) {
2061                 OSIncrementAtomic(&cfil_stats.cfs_sock_attach_already);
2062                 CFIL_LOG(LOG_ERR, "already attached");
2063         } else {
2064                 cfil_info_alloc(so);
2065                 if (so->so_cfil == NULL) {
2066                         error = ENOMEM;
2067                         OSIncrementAtomic(&cfil_stats.cfs_sock_attach_no_mem);
2068                         goto done;
2069                 }
2070         }
2071         if (cfil_info_attach_unit(so, filter_control_unit) == 0) {
2072                 CFIL_LOG(LOG_ERR, "cfil_info_attach_unit(%u) failed",
2073                         filter_control_unit);
2074                 OSIncrementAtomic(&cfil_stats.cfs_sock_attach_failed);
2075                 goto done;
2076         }
2077         CFIL_LOG(LOG_INFO, "so %llx filter_control_unit %u sockid %llx",
2078                 (uint64_t)VM_KERNEL_ADDRPERM(so),
2079                 filter_control_unit, so->so_cfil->cfi_sock_id);
2080
2081         so->so_flags |= SOF_CONTENT_FILTER;
2082         OSIncrementAtomic(&cfil_stats.cfs_sock_attached);
2083
2084         /* Hold a reference on the socket */
2085         so->so_usecount++;
2086
2087         error = cfil_dispatch_attach_event(so, filter_control_unit);
2088         /* We can recover from flow control or out of memory errors */
2089         if (error == ENOBUFS || error == ENOMEM)
2090                 error = 0;
2091         else if (error != 0)
2092                 goto done;
2093
2094         CFIL_INFO_VERIFY(so->so_cfil);
2095 done:
2096         return (error);
2097 }
2098
2099 /*
2100  * Entry point from Sockets layer
2101  * The socket is locked.
2102  */
2103 errno_t
2104 cfil_sock_detach(struct socket *so)
2105 {
2106         if (so->so_cfil) {
2107                 cfil_info_free(so, so->so_cfil);
2108                 OSIncrementAtomic(&cfil_stats.cfs_sock_detached);
2109         }
2110         return (0);
2111 }
2112
2113 static int
2114 cfil_dispatch_attach_event(struct socket *so, uint32_t filter_control_unit)
2115 {
2116         errno_t error = 0;
2117         struct cfil_entry *entry = NULL;
2118         struct cfil_msg_sock_attached msg_attached;
2119         uint32_t kcunit;
2120         struct content_filter *cfc = NULL;
2121
2122         socket_lock_assert_owned(so);
2123
2124         cfil_rw_lock_shared(&cfil_lck_rw);
2125
2126         if (so->so_proto == NULL || so->so_proto->pr_domain == NULL) {
2127                 error = EINVAL;
2128                 goto done;
2129         }
2130         /*
2131          * Find the matching filter unit
2132          */
2133         for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
2134                 cfc = content_filters[kcunit - 1];
2135
2136                 if (cfc == NULL)
2137                         continue;
2138                 if (cfc->cf_necp_control_unit != filter_control_unit)
2139                         continue;
2140                 entry = &so->so_cfil->cfi_entries[kcunit - 1];
2141                 if (entry->cfe_filter == NULL)
2142                         continue;
2143
2144                 VERIFY(cfc == entry->cfe_filter);
2145
2146                 break;
2147         }
2148
2149         if (entry == NULL || entry->cfe_filter == NULL)
2150                 goto done;
2151
2152         if ((entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED))
2153                 goto done;
2154
2155         CFIL_LOG(LOG_INFO, "so %llx filter_control_unit %u kcunit %u",
2156                 (uint64_t)VM_KERNEL_ADDRPERM(so), filter_control_unit, kcunit);
2157
2158         /* Would be wasteful to try when flow controlled */
2159         if (cfc->cf_flags & CFF_FLOW_CONTROLLED) {
2160                 error = ENOBUFS;
2161                 goto done;
2162         }
2163
2164         bzero(&msg_attached, sizeof(struct cfil_msg_sock_attached));
2165         msg_attached.cfs_msghdr.cfm_len = sizeof(struct cfil_msg_sock_attached);
2166         msg_attached.cfs_msghdr.cfm_version = CFM_VERSION_CURRENT;
2167         msg_attached.cfs_msghdr.cfm_type = CFM_TYPE_EVENT;
2168         msg_attached.cfs_msghdr.cfm_op = CFM_OP_SOCKET_ATTACHED;
2169         msg_attached.cfs_msghdr.cfm_sock_id = entry->cfe_cfil_info->cfi_sock_id;
2170
2171         msg_attached.cfs_sock_family = so->so_proto->pr_domain->dom_family;
2172         msg_attached.cfs_sock_type = so->so_proto->pr_type;
2173         msg_attached.cfs_sock_protocol = so->so_proto->pr_protocol;
2174         msg_attached.cfs_pid = so->last_pid;
2175         memcpy(msg_attached.cfs_uuid, so->last_uuid, sizeof(uuid_t));
2176         if (so->so_flags & SOF_DELEGATED) {
2177                 msg_attached.cfs_e_pid = so->e_pid;
2178                 memcpy(msg_attached.cfs_e_uuid, so->e_uuid, sizeof(uuid_t));
2179         } else {
2180                 msg_attached.cfs_e_pid = so->last_pid;
2181                 memcpy(msg_attached.cfs_e_uuid, so->last_uuid, sizeof(uuid_t));
2182         }
2183         error = ctl_enqueuedata(entry->cfe_filter->cf_kcref,
2184                                 entry->cfe_filter->cf_kcunit,
2185                                 &msg_attached,
2186                                 sizeof(struct cfil_msg_sock_attached),
2187                                 CTL_DATA_EOR);
2188         if (error != 0) {
2189                 CFIL_LOG(LOG_ERR, "ctl_enqueuedata() failed: %d", error);
2190                 goto done;
2191         }
2192         microuptime(&entry->cfe_last_event);
2193         so->so_cfil->cfi_first_event.tv_sec = entry->cfe_last_event.tv_sec;
2194         so->so_cfil->cfi_first_event.tv_usec = entry->cfe_last_event.tv_usec;
2195
2196         entry->cfe_flags |= CFEF_SENT_SOCK_ATTACHED;
2197         OSIncrementAtomic(&cfil_stats.cfs_attach_event_ok);
2198 done:
2199
2200         /* We can recover from flow control */
2201         if (error == ENOBUFS) {
2202                 entry->cfe_flags |= CFEF_FLOW_CONTROLLED;
2203                 OSIncrementAtomic(&cfil_stats.cfs_attach_event_flow_control);
2204
2205                 if (!cfil_rw_lock_shared_to_exclusive(&cfil_lck_rw))
2206                         cfil_rw_lock_exclusive(&cfil_lck_rw);
2207
2208                 cfc->cf_flags |= CFF_FLOW_CONTROLLED;
2209
2210                 cfil_rw_unlock_exclusive(&cfil_lck_rw);
2211         } else {
2212                 if (error != 0)
2213                         OSIncrementAtomic(&cfil_stats.cfs_attach_event_fail);
2214
2215                 cfil_rw_unlock_shared(&cfil_lck_rw);
2216         }
2217         return (error);
2218 }
2219
2220 static int
2221 cfil_dispatch_disconnect_event(struct socket *so, uint32_t kcunit, int outgoing)
2222 {
2223         errno_t error = 0;
2224         struct mbuf *msg = NULL;
2225         struct cfil_entry *entry;
2226         struct cfe_buf *entrybuf;
2227         struct cfil_msg_hdr msg_disconnected;
2228         struct content_filter *cfc;
2229
2230         socket_lock_assert_owned(so);
2231
2232         cfil_rw_lock_shared(&cfil_lck_rw);
2233
2234         entry = &so->so_cfil->cfi_entries[kcunit - 1];
2235         if (outgoing)
2236                 entrybuf = &entry->cfe_snd;
2237         else
2238                 entrybuf = &entry->cfe_rcv;
2239
2240         cfc = entry->cfe_filter;
2241         if (cfc == NULL)
2242                 goto done;
2243
2244         CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d",
2245                 (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing);
2246
2247         /*
2248          * Send the disconnection event once
2249          */
2250         if ((outgoing && (entry->cfe_flags & CFEF_SENT_DISCONNECT_OUT)) ||
2251                 (!outgoing && (entry->cfe_flags & CFEF_SENT_DISCONNECT_IN))) {
2252                 CFIL_LOG(LOG_INFO, "so %llx disconnect already sent",
2253                         (uint64_t)VM_KERNEL_ADDRPERM(so));
2254                 goto done;
2255         }
2256
2257         /*
2258          * We're not disconnected as long as some data is waiting
2259          * to be delivered to the filter
2260          */
2261         if (outgoing && cfil_queue_empty(&entrybuf->cfe_ctl_q) == 0) {
2262                 CFIL_LOG(LOG_INFO, "so %llx control queue not empty",
2263                         (uint64_t)VM_KERNEL_ADDRPERM(so));
2264                 error = EBUSY;
2265                 goto done;
2266         }
2267         /* Would be wasteful to try when flow controlled */
2268         if (cfc->cf_flags & CFF_FLOW_CONTROLLED) {
2269                 error = ENOBUFS;
2270                 goto done;
2271         }
2272
2273         bzero(&msg_disconnected, sizeof(struct cfil_msg_hdr));
2274         msg_disconnected.cfm_len = sizeof(struct cfil_msg_hdr);
2275         msg_disconnected.cfm_version = CFM_VERSION_CURRENT;
2276         msg_disconnected.cfm_type = CFM_TYPE_EVENT;
2277         msg_disconnected.cfm_op = outgoing ? CFM_OP_DISCONNECT_OUT :
2278                 CFM_OP_DISCONNECT_IN;
2279         msg_disconnected.cfm_sock_id = entry->cfe_cfil_info->cfi_sock_id;
2280         error = ctl_enqueuedata(entry->cfe_filter->cf_kcref,
2281                                 entry->cfe_filter->cf_kcunit,
2282                                 &msg_disconnected,
2283                                 sizeof(struct cfil_msg_hdr),
2284                                 CTL_DATA_EOR);
2285         if (error != 0) {
2286                 CFIL_LOG(LOG_ERR, "ctl_enqueuembuf() failed: %d", error);
2287                 mbuf_freem(msg);
2288                 goto done;
2289         }
2290         microuptime(&entry->cfe_last_event);
2291         CFI_ADD_TIME_LOG(so->so_cfil, &entry->cfe_last_event, &so->so_cfil->cfi_first_event, msg_disconnected.cfm_op);
2292
2293         /* Remember we have sent the disconnection message */
2294         if (outgoing) {
2295                 entry->cfe_flags |= CFEF_SENT_DISCONNECT_OUT;
2296                 OSIncrementAtomic(&cfil_stats.cfs_disconnect_out_event_ok);
2297         } else {
2298                 entry->cfe_flags |= CFEF_SENT_DISCONNECT_IN;
2299                 OSIncrementAtomic(&cfil_stats.cfs_disconnect_in_event_ok);
2300         }
2301 done:
2302         if (error == ENOBUFS) {
2303                 entry->cfe_flags |= CFEF_FLOW_CONTROLLED;
2304                 OSIncrementAtomic(
2305                         &cfil_stats.cfs_disconnect_event_flow_control);
2306
2307                 if (!cfil_rw_lock_shared_to_exclusive(&cfil_lck_rw))
2308                         cfil_rw_lock_exclusive(&cfil_lck_rw);
2309
2310                 cfc->cf_flags |= CFF_FLOW_CONTROLLED;
2311
2312                 cfil_rw_unlock_exclusive(&cfil_lck_rw);
2313         } else {
2314                 if (error != 0)
2315                         OSIncrementAtomic(
2316                                 &cfil_stats.cfs_disconnect_event_fail);
2317
2318                 cfil_rw_unlock_shared(&cfil_lck_rw);
2319         }
2320         return (error);
2321 }
2322
2323 int
2324 cfil_dispatch_closed_event(struct socket *so, int kcunit)
2325 {
2326         struct cfil_entry *entry;
2327         struct cfil_msg_sock_closed msg_closed;
2328         errno_t error = 0;
2329         struct content_filter *cfc;
2330
2331         socket_lock_assert_owned(so);
2332
2333         cfil_rw_lock_shared(&cfil_lck_rw);
2334
2335         entry = &so->so_cfil->cfi_entries[kcunit - 1];
2336         cfc = entry->cfe_filter;
2337         if (cfc == NULL)
2338                 goto done;
2339
2340         CFIL_LOG(LOG_INFO, "so %llx kcunit %d",
2341                 (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit);
2342
2343         /* Would be wasteful to try when flow controlled */
2344         if (cfc->cf_flags & CFF_FLOW_CONTROLLED) {
2345                 error = ENOBUFS;
2346                 goto done;
2347         }
2348         /*
2349          * Send a single closed message per filter
2350          */
2351         if ((entry->cfe_flags & CFEF_SENT_SOCK_CLOSED) != 0)
2352                 goto done;
2353         if ((entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED) == 0)
2354                 goto done;
2355
2356         microuptime(&entry->cfe_last_event);
2357         CFI_ADD_TIME_LOG(so->so_cfil, &entry->cfe_last_event, &so->so_cfil->cfi_first_event, CFM_OP_SOCKET_CLOSED);
2358
2359         bzero(&msg_closed, sizeof(struct cfil_msg_sock_closed));
2360         msg_closed.cfc_msghdr.cfm_len = sizeof(struct cfil_msg_sock_closed);
2361         msg_closed.cfc_msghdr.cfm_version = CFM_VERSION_CURRENT;
2362         msg_closed.cfc_msghdr.cfm_type = CFM_TYPE_EVENT;
2363         msg_closed.cfc_msghdr.cfm_op = CFM_OP_SOCKET_CLOSED;
2364         msg_closed.cfc_msghdr.cfm_sock_id = entry->cfe_cfil_info->cfi_sock_id;
2365         msg_closed.cfc_first_event.tv_sec = so->so_cfil->cfi_first_event.tv_sec;
2366         msg_closed.cfc_first_event.tv_usec = so->so_cfil->cfi_first_event.tv_usec;
2367         memcpy(msg_closed.cfc_op_time, so->so_cfil->cfi_op_time, sizeof(uint32_t)*CFI_MAX_TIME_LOG_ENTRY);
2368         memcpy(msg_closed.cfc_op_list, so->so_cfil->cfi_op_list, sizeof(unsigned char)*CFI_MAX_TIME_LOG_ENTRY);
2369         msg_closed.cfc_op_list_ctr = so->so_cfil->cfi_op_list_ctr;
2370
2371         CFIL_LOG(LOG_INFO, "sock id %llu, op ctr %d, start time %llu.%llu", msg_closed.cfc_msghdr.cfm_sock_id, so->so_cfil->cfi_op_list_ctr, so->so_cfil->cfi_first_event.tv_sec, so->so_cfil->cfi_first_event.tv_usec);
2372         /* for debugging
2373         if (msg_closed.cfc_op_list_ctr > CFI_MAX_TIME_LOG_ENTRY) {
2374                 msg_closed.cfc_op_list_ctr  = CFI_MAX_TIME_LOG_ENTRY;       // just in case
2375         }
2376         for (unsigned int i = 0; i < msg_closed.cfc_op_list_ctr ; i++) {
2377                 CFIL_LOG(LOG_ERR, "MD: socket %llu event %2u, time + %u msec", msg_closed.cfc_msghdr.cfm_sock_id, (unsigned short)msg_closed.cfc_op_list[i], msg_closed.cfc_op_time[i]);
2378         }
2379         */
2380
2381         error = ctl_enqueuedata(entry->cfe_filter->cf_kcref,
2382                                 entry->cfe_filter->cf_kcunit,
2383                                 &msg_closed,
2384                                 sizeof(struct cfil_msg_sock_closed),
2385                                 CTL_DATA_EOR);
2386         if (error != 0) {
2387                 CFIL_LOG(LOG_ERR, "ctl_enqueuedata() failed: %d",
2388                         error);
2389                 goto done;
2390         }
2391
2392         entry->cfe_flags |= CFEF_SENT_SOCK_CLOSED;
2393         OSIncrementAtomic(&cfil_stats.cfs_closed_event_ok);
2394 done:
2395         /* We can recover from flow control */
2396         if (error == ENOBUFS) {
2397                 entry->cfe_flags |= CFEF_FLOW_CONTROLLED;
2398                 OSIncrementAtomic(&cfil_stats.cfs_closed_event_flow_control);
2399
2400                 if (!cfil_rw_lock_shared_to_exclusive(&cfil_lck_rw))
2401                         cfil_rw_lock_exclusive(&cfil_lck_rw);
2402
2403                 cfc->cf_flags |= CFF_FLOW_CONTROLLED;
2404
2405                 cfil_rw_unlock_exclusive(&cfil_lck_rw);
2406         } else {
2407                 if (error != 0)
2408                         OSIncrementAtomic(&cfil_stats.cfs_closed_event_fail);
2409
2410                 cfil_rw_unlock_shared(&cfil_lck_rw);
2411         }
2412
2413         return (error);
2414 }
2415
2416 static void
2417 fill_ip6_sockaddr_4_6(union sockaddr_in_4_6 *sin46,
2418         struct in6_addr *ip6, u_int16_t port)
2419 {
2420         struct sockaddr_in6 *sin6 = &sin46->sin6;
2421
2422         sin6->sin6_family = AF_INET6;
2423         sin6->sin6_len = sizeof(*sin6);
2424         sin6->sin6_port = port;
2425         sin6->sin6_addr = *ip6;
2426         if (IN6_IS_SCOPE_EMBED(&sin6->sin6_addr)) {
2427                 sin6->sin6_scope_id = ntohs(sin6->sin6_addr.s6_addr16[1]);
2428                 sin6->sin6_addr.s6_addr16[1] = 0;
2429         }
2430 }
2431
2432 static void
2433 fill_ip_sockaddr_4_6(union sockaddr_in_4_6 *sin46,
2434         struct in_addr ip, u_int16_t port)
2435 {
2436         struct sockaddr_in *sin = &sin46->sin;
2437
2438         sin->sin_family = AF_INET;
2439         sin->sin_len = sizeof(*sin);
2440         sin->sin_port = port;
2441         sin->sin_addr.s_addr = ip.s_addr;
2442 }
2443
2444 static int
2445 cfil_dispatch_data_event(struct socket *so, uint32_t kcunit, int outgoing,
2446         struct mbuf *data, unsigned int copyoffset, unsigned int copylen)
2447 {
2448         errno_t error = 0;
2449         struct mbuf *copy = NULL;
2450         struct mbuf *msg = NULL;
2451         unsigned int one = 1;
2452         struct cfil_msg_data_event *data_req;
2453         size_t hdrsize;
2454         struct inpcb *inp = (struct inpcb *)so->so_pcb;
2455         struct cfil_entry *entry;
2456         struct cfe_buf *entrybuf;
2457         struct content_filter *cfc;
2458         struct timeval tv;
2459
2460         cfil_rw_lock_shared(&cfil_lck_rw);
2461
2462         entry = &so->so_cfil->cfi_entries[kcunit - 1];
2463         if (outgoing)
2464                 entrybuf = &entry->cfe_snd;
2465         else
2466                 entrybuf = &entry->cfe_rcv;
2467
2468         cfc = entry->cfe_filter;
2469         if (cfc == NULL)
2470                 goto done;
2471
2472         CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d",
2473                 (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing);
2474
2475         socket_lock_assert_owned(so);
2476
2477         /* Would be wasteful to try */
2478         if (cfc->cf_flags & CFF_FLOW_CONTROLLED) {
2479                 error = ENOBUFS;
2480                 goto done;
2481         }
2482
2483         /* Make a copy of the data to pass to kernel control socket */
2484         copy = m_copym_mode(data, copyoffset, copylen, M_DONTWAIT,
2485                 M_COPYM_NOOP_HDR);
2486         if (copy == NULL) {
2487                 CFIL_LOG(LOG_ERR, "m_copym_mode() failed");
2488                 error = ENOMEM;
2489                 goto done;
2490         }
2491
2492         /* We need an mbuf packet for the message header */
2493         hdrsize = sizeof(struct cfil_msg_data_event);
2494         error = mbuf_allocpacket(MBUF_DONTWAIT, hdrsize, &one, &msg);
2495         if (error != 0) {
2496                 CFIL_LOG(LOG_ERR, "mbuf_allocpacket() failed");
2497                 m_freem(copy);
2498                 /*
2499                  * ENOBUFS is to indicate flow control
2500                  */
2501                 error = ENOMEM;
2502                 goto done;
2503         }
2504         mbuf_setlen(msg, hdrsize);
2505         mbuf_pkthdr_setlen(msg, hdrsize + copylen);
2506         msg->m_next = copy;
2507         data_req = (struct cfil_msg_data_event *)mbuf_data(msg);
2508         bzero(data_req, hdrsize);
2509         data_req->cfd_msghdr.cfm_len = hdrsize + copylen;
2510         data_req->cfd_msghdr.cfm_version = 1;
2511         data_req->cfd_msghdr.cfm_type = CFM_TYPE_EVENT;
2512         data_req->cfd_msghdr.cfm_op =
2513                 outgoing ? CFM_OP_DATA_OUT : CFM_OP_DATA_IN;
2514         data_req->cfd_msghdr.cfm_sock_id =
2515                 entry->cfe_cfil_info->cfi_sock_id;
2516         data_req->cfd_start_offset = entrybuf->cfe_peeked;
2517         data_req->cfd_end_offset = entrybuf->cfe_peeked + copylen;
2518
2519         /*
2520          * TBD:
2521          * For non connected sockets need to copy addresses from passed
2522          * parameters
2523          */
2524         if (inp->inp_vflag & INP_IPV6) {
2525                 if (outgoing) {
2526                         fill_ip6_sockaddr_4_6(&data_req->cfc_src,
2527                                 &inp->in6p_laddr, inp->inp_lport);
2528                         fill_ip6_sockaddr_4_6(&data_req->cfc_dst,
2529                                 &inp->in6p_faddr, inp->inp_fport);
2530                 } else {
2531                         fill_ip6_sockaddr_4_6(&data_req->cfc_src,
2532                                 &inp->in6p_faddr, inp->inp_fport);
2533                         fill_ip6_sockaddr_4_6(&data_req->cfc_dst,
2534                                 &inp->in6p_laddr, inp->inp_lport);
2535                 }
2536         } else if (inp->inp_vflag & INP_IPV4) {
2537                 if (outgoing) {
2538                         fill_ip_sockaddr_4_6(&data_req->cfc_src,
2539                                 inp->inp_laddr, inp->inp_lport);
2540                         fill_ip_sockaddr_4_6(&data_req->cfc_dst,
2541                                 inp->inp_faddr, inp->inp_fport);
2542                 } else {
2543                         fill_ip_sockaddr_4_6(&data_req->cfc_src,
2544                                 inp->inp_faddr, inp->inp_fport);
2545                         fill_ip_sockaddr_4_6(&data_req->cfc_dst,
2546                                 inp->inp_laddr, inp->inp_lport);
2547                 }
2548         }
2549
2550         microuptime(&tv);
2551         CFI_ADD_TIME_LOG(so->so_cfil, &tv, &so->so_cfil->cfi_first_event, data_req->cfd_msghdr.cfm_op);
2552
2553         /* Pass the message to the content filter */
2554         error = ctl_enqueuembuf(entry->cfe_filter->cf_kcref,
2555                                 entry->cfe_filter->cf_kcunit,
2556                                 msg, CTL_DATA_EOR);
2557         if (error != 0) {
2558                 CFIL_LOG(LOG_ERR, "ctl_enqueuembuf() failed: %d", error);
2559                 mbuf_freem(msg);
2560                 goto done;
2561         }
2562         entry->cfe_flags &= ~CFEF_FLOW_CONTROLLED;
2563         OSIncrementAtomic(&cfil_stats.cfs_data_event_ok);
2564 done:
2565         if (error == ENOBUFS) {
2566                 entry->cfe_flags |= CFEF_FLOW_CONTROLLED;
2567                 OSIncrementAtomic(
2568                         &cfil_stats.cfs_data_event_flow_control);
2569
2570                 if (!cfil_rw_lock_shared_to_exclusive(&cfil_lck_rw))
2571                         cfil_rw_lock_exclusive(&cfil_lck_rw);
2572
2573                 cfc->cf_flags |= CFF_FLOW_CONTROLLED;
2574
2575                 cfil_rw_unlock_exclusive(&cfil_lck_rw);
2576         } else {
2577                 if (error != 0)
2578                         OSIncrementAtomic(&cfil_stats.cfs_data_event_fail);
2579
2580                 cfil_rw_unlock_shared(&cfil_lck_rw);
2581         }
2582         return (error);
2583 }
2584
2585 /*
2586  * Process the queue of data waiting to be delivered to content filter
2587  */
2588 static int
2589 cfil_data_service_ctl_q(struct socket *so, uint32_t kcunit, int outgoing)
2590 {
2591         errno_t error = 0;
2592         struct mbuf *data, *tmp = NULL;
2593         unsigned int datalen = 0, copylen = 0, copyoffset = 0;
2594         struct cfil_entry *entry;
2595         struct cfe_buf *entrybuf;
2596         uint64_t currentoffset = 0;
2597
2598         if (so->so_cfil == NULL)
2599                 return (0);
2600
2601         CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d",
2602                 (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing);
2603
2604         socket_lock_assert_owned(so);
2605
2606         entry = &so->so_cfil->cfi_entries[kcunit - 1];
2607         if (outgoing)
2608                 entrybuf = &entry->cfe_snd;
2609         else
2610                 entrybuf = &entry->cfe_rcv;
2611
2612         /* Send attached message if not yet done */
2613         if ((entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED) == 0) {
2614                 error = cfil_dispatch_attach_event(so, kcunit);
2615                 if (error != 0) {
2616                         /* We can recover from flow control */
2617                         if (error == ENOBUFS || error == ENOMEM)
2618                                 error = 0;
2619                         goto done;
2620                 }
2621         } else if ((entry->cfe_flags & CFEF_DATA_START) == 0) {
2622                 OSIncrementAtomic(&cfil_stats.cfs_ctl_q_not_started);
2623                 goto done;
2624         }
2625         CFIL_LOG(LOG_DEBUG, "pass_offset %llu peeked %llu peek_offset %llu",
2626                 entrybuf->cfe_pass_offset,
2627                 entrybuf->cfe_peeked,
2628                 entrybuf->cfe_peek_offset);
2629
2630         /* Move all data that can pass */
2631         while ((data = cfil_queue_first(&entrybuf->cfe_ctl_q)) != NULL &&
2632                 entrybuf->cfe_ctl_q.q_start < entrybuf->cfe_pass_offset) {
2633                 datalen = cfil_data_length(data, NULL);
2634                 tmp = data;
2635
2636                 if (entrybuf->cfe_ctl_q.q_start + datalen <=
2637                         entrybuf->cfe_pass_offset) {
2638                         /*
2639                          * The first mbuf can fully pass
2640                          */
2641                         copylen = datalen;
2642                 } else {
2643                         /*
2644                          * The first mbuf can partially pass
2645                          */
2646                         copylen = entrybuf->cfe_pass_offset -
2647                                 entrybuf->cfe_ctl_q.q_start;
2648                 }
2649                 VERIFY(copylen <= datalen);
2650
2651                 CFIL_LOG(LOG_DEBUG,
2652                         "%llx first %llu peeked %llu pass %llu peek %llu"
2653                         "datalen %u copylen %u",
2654                         (uint64_t)VM_KERNEL_ADDRPERM(tmp),
2655                         entrybuf->cfe_ctl_q.q_start,
2656                         entrybuf->cfe_peeked,
2657                         entrybuf->cfe_pass_offset,
2658                         entrybuf->cfe_peek_offset,
2659                         datalen, copylen);
2660
2661                 /*
2662                  * Data that passes has been peeked at explicitly or
2663                  * implicitly
2664                  */
2665                 if (entrybuf->cfe_ctl_q.q_start + copylen >
2666                         entrybuf->cfe_peeked)
2667                         entrybuf->cfe_peeked =
2668                                 entrybuf->cfe_ctl_q.q_start + copylen;
2669                 /*
2670                  * Stop on partial pass
2671                  */
2672                 if (copylen < datalen)
2673                         break;
2674
2675                 /* All good, move full data from ctl queue to pending queue */
2676                 cfil_queue_remove(&entrybuf->cfe_ctl_q, data, datalen);
2677
2678                 cfil_queue_enqueue(&entrybuf->cfe_pending_q, data, datalen);
2679                 if (outgoing)
2680                         OSAddAtomic64(datalen,
2681                                 &cfil_stats.cfs_pending_q_out_enqueued);
2682                 else
2683                         OSAddAtomic64(datalen,
2684                                 &cfil_stats.cfs_pending_q_in_enqueued);
2685         }
2686         CFIL_INFO_VERIFY(so->so_cfil);
2687         if (tmp != NULL)
2688                 CFIL_LOG(LOG_DEBUG,
2689                         "%llx first %llu peeked %llu pass %llu peek %llu"
2690                         "datalen %u copylen %u",
2691                         (uint64_t)VM_KERNEL_ADDRPERM(tmp),
2692                         entrybuf->cfe_ctl_q.q_start,
2693                         entrybuf->cfe_peeked,
2694                         entrybuf->cfe_pass_offset,
2695                         entrybuf->cfe_peek_offset,
2696                         datalen, copylen);
2697         tmp = NULL;
2698
2699         /* Now deal with remaining data the filter wants to peek at */
2700         for (data = cfil_queue_first(&entrybuf->cfe_ctl_q),
2701                 currentoffset = entrybuf->cfe_ctl_q.q_start;
2702                 data != NULL && currentoffset < entrybuf->cfe_peek_offset;
2703                 data = cfil_queue_next(&entrybuf->cfe_ctl_q, data),
2704                 currentoffset += datalen) {
2705                 datalen = cfil_data_length(data, NULL);
2706                 tmp = data;
2707
2708                 /* We've already peeked at this mbuf */
2709                 if (currentoffset + datalen <= entrybuf->cfe_peeked)
2710                         continue;
2711                 /*
2712                  * The data in the first mbuf may have been
2713                  * partially peeked at
2714                  */
2715                 copyoffset = entrybuf->cfe_peeked - currentoffset;
2716                 VERIFY(copyoffset < datalen);
2717                 copylen = datalen - copyoffset;
2718                 VERIFY(copylen <= datalen);
2719                 /*
2720                  * Do not copy more than needed
2721                  */
2722                 if (currentoffset + copyoffset + copylen >
2723                         entrybuf->cfe_peek_offset) {
2724                         copylen = entrybuf->cfe_peek_offset -
2725                                 (currentoffset + copyoffset);
2726                 }
2727
2728                 CFIL_LOG(LOG_DEBUG,
2729                         "%llx current %llu peeked %llu pass %llu peek %llu"
2730                         "datalen %u copylen %u copyoffset %u",
2731                         (uint64_t)VM_KERNEL_ADDRPERM(tmp),
2732                         currentoffset,
2733                         entrybuf->cfe_peeked,
2734                         entrybuf->cfe_pass_offset,
2735                         entrybuf->cfe_peek_offset,
2736                         datalen, copylen, copyoffset);
2737
2738                 /*
2739                  * Stop if there is nothing more to peek at
2740                  */
2741                 if (copylen == 0)
2742                         break;
2743                 /*
2744                  * Let the filter get a peek at this span of data
2745                  */
2746                 error = cfil_dispatch_data_event(so, kcunit,
2747                         outgoing, data, copyoffset, copylen);
2748                 if (error != 0) {
2749                         /* On error, leave data in ctl_q */
2750                         break;
2751                 }
2752                 entrybuf->cfe_peeked += copylen;
2753                 if (outgoing)
2754                         OSAddAtomic64(copylen,
2755                                 &cfil_stats.cfs_ctl_q_out_peeked);
2756                 else
2757                         OSAddAtomic64(copylen,
2758                                 &cfil_stats.cfs_ctl_q_in_peeked);
2759
2760                 /* Stop when data could not be fully peeked at */
2761                 if (copylen + copyoffset < datalen)
2762                         break;
2763         }
2764         CFIL_INFO_VERIFY(so->so_cfil);
2765         if (tmp != NULL)
2766                 CFIL_LOG(LOG_DEBUG,
2767                         "%llx first %llu peeked %llu pass %llu peek %llu"
2768                         "datalen %u copylen %u copyoffset %u",
2769                         (uint64_t)VM_KERNEL_ADDRPERM(tmp),
2770                         currentoffset,
2771                         entrybuf->cfe_peeked,
2772                         entrybuf->cfe_pass_offset,
2773                         entrybuf->cfe_peek_offset,
2774                         datalen, copylen, copyoffset);
2775
2776         /*
2777          * Process data that has passed the filter
2778          */
2779         error = cfil_service_pending_queue(so, kcunit, outgoing);
2780         if (error != 0) {
2781                 CFIL_LOG(LOG_ERR, "cfil_service_pending_queue() error %d",
2782                         error);
2783                 goto done;
2784         }
2785
2786         /*
2787          * Dispatch disconnect events that could not be sent
2788          */
2789         if (so->so_cfil == NULL)
2790                 goto done;
2791         else if (outgoing) {
2792                 if ((so->so_cfil->cfi_flags & CFIF_SHUT_WR) &&
2793                     !(entry->cfe_flags & CFEF_SENT_DISCONNECT_OUT))
2794                         cfil_dispatch_disconnect_event(so, kcunit, 1);
2795         } else {
2796                 if ((so->so_cfil->cfi_flags & CFIF_SHUT_RD) &&
2797                     !(entry->cfe_flags & CFEF_SENT_DISCONNECT_IN))
2798                         cfil_dispatch_disconnect_event(so, kcunit, 0);
2799         }
2800
2801 done:
2802         CFIL_LOG(LOG_DEBUG,
2803                 "first %llu peeked %llu pass %llu peek %llu",
2804                 entrybuf->cfe_ctl_q.q_start,
2805                 entrybuf->cfe_peeked,
2806                 entrybuf->cfe_pass_offset,
2807                 entrybuf->cfe_peek_offset);
2808
2809         CFIL_INFO_VERIFY(so->so_cfil);
2810         return (error);
2811 }
2812
2813 /*
2814  * cfil_data_filter()
2815  *
2816  * Process data for a content filter installed on a socket
2817  */
2818 int
2819 cfil_data_filter(struct socket *so, uint32_t kcunit, int outgoing,
2820         struct mbuf *data, uint64_t datalen)
2821 {
2822         errno_t error = 0;
2823         struct cfil_entry *entry;
2824         struct cfe_buf *entrybuf;
2825
2826         CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d",
2827                 (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing);
2828
2829         socket_lock_assert_owned(so);
2830
2831         entry = &so->so_cfil->cfi_entries[kcunit - 1];
2832         if (outgoing)
2833                 entrybuf = &entry->cfe_snd;
2834         else
2835                 entrybuf = &entry->cfe_rcv;
2836
2837         /* Are we attached to the filter? */
2838         if (entry->cfe_filter == NULL) {
2839                 error = 0;
2840                 goto done;
2841         }
2842
2843         /* Dispatch to filters */
2844         cfil_queue_enqueue(&entrybuf->cfe_ctl_q, data, datalen);
2845         if (outgoing)
2846                 OSAddAtomic64(datalen,
2847                         &cfil_stats.cfs_ctl_q_out_enqueued);
2848         else
2849                 OSAddAtomic64(datalen,
2850                         &cfil_stats.cfs_ctl_q_in_enqueued);
2851
2852         error = cfil_data_service_ctl_q(so, kcunit, outgoing);
2853         if (error != 0) {
2854                 CFIL_LOG(LOG_ERR, "cfil_data_service_ctl_q() error %d",
2855                         error);
2856         }
2857         /*
2858          * We have to return EJUSTRETURN in all cases to avoid double free
2859          * by socket layer
2860          */
2861         error = EJUSTRETURN;
2862 done:
2863         CFIL_INFO_VERIFY(so->so_cfil);
2864
2865         CFIL_LOG(LOG_INFO, "return %d", error);
2866         return (error);
2867 }
2868
2869 /*
2870  * cfil_service_inject_queue() re-inject data that passed the
2871  * content filters
2872  */
2873 static int
2874 cfil_service_inject_queue(struct socket *so, int outgoing)
2875 {
2876         mbuf_t data;
2877         unsigned int datalen;
2878         int mbcnt;
2879         unsigned int copylen;
2880         errno_t error = 0;
2881         struct mbuf *copy = NULL;
2882         struct cfi_buf *cfi_buf;
2883         struct cfil_queue *inject_q;
2884         int need_rwakeup = 0;
2885
2886         if (so->so_cfil == NULL)
2887                 return (0);
2888
2889         CFIL_LOG(LOG_INFO, "so %llx outgoing %d",
2890                 (uint64_t)VM_KERNEL_ADDRPERM(so), outgoing);
2891
2892         socket_lock_assert_owned(so);
2893
2894         if (outgoing) {
2895                 cfi_buf = &so->so_cfil->cfi_snd;
2896                 so->so_cfil->cfi_flags &= ~CFIF_RETRY_INJECT_OUT;
2897         } else {
2898                 cfi_buf = &so->so_cfil->cfi_rcv;
2899                 so->so_cfil->cfi_flags &= ~CFIF_RETRY_INJECT_IN;
2900         }
2901         inject_q = &cfi_buf->cfi_inject_q;
2902
2903         while ((data = cfil_queue_first(inject_q)) != NULL) {
2904                 datalen = cfil_data_length(data, &mbcnt);
2905
2906                 CFIL_LOG(LOG_INFO, "data %llx datalen %u",
2907                         (uint64_t)VM_KERNEL_ADDRPERM(data), datalen);
2908
2909                 /* Make a copy in case of injection error */
2910                 copy = m_copym_mode(data, 0, M_COPYALL, M_DONTWAIT,
2911                         M_COPYM_COPY_HDR);
2912                 if (copy == NULL) {
2913                         CFIL_LOG(LOG_ERR, "m_copym_mode() failed");
2914                         error = ENOMEM;
2915                         break;
2916                 }
2917
2918                 if ((copylen = m_length(copy)) != datalen)
2919                         panic("%s so %p copylen %d != datalen %d",
2920                                 __func__, so, copylen, datalen);
2921
2922                 if (outgoing) {
2923                         socket_unlock(so, 0);
2924
2925                         /*
2926                          * Set both DONTWAIT and NBIO flags are we really
2927                          * do not want to block
2928                          */
2929                         error = sosend(so, NULL, NULL,
2930                                         copy, NULL,
2931                                         MSG_SKIPCFIL | MSG_DONTWAIT | MSG_NBIO);
2932
2933                         socket_lock(so, 0);
2934
2935                         if (error != 0) {
2936                                 CFIL_LOG(LOG_ERR, "sosend() failed %d",
2937                                         error);
2938                         }
2939                 } else {
2940                         copy->m_flags |= M_SKIPCFIL;
2941
2942                         /*
2943                          * NOTE:
2944                          * This work only because we support plain TCP
2945                          * For UDP, RAWIP, MPTCP and message TCP we'll
2946                          * need to call the appropriate sbappendxxx()
2947                          * of fix sock_inject_data_in()
2948                          */
2949                         if (sbappendstream(&so->so_rcv, copy))
2950                                 need_rwakeup = 1;
2951                 }
2952
2953                 /* Need to reassess if filter is still attached after unlock */
2954                 if (so->so_cfil == NULL) {
2955                         CFIL_LOG(LOG_ERR, "so %llx cfil detached",
2956                                 (uint64_t)VM_KERNEL_ADDRPERM(so));
2957                         OSIncrementAtomic(&cfil_stats.cfs_inject_q_detached);
2958                         error = 0;
2959                         break;
2960                 }
2961                 if (error != 0)
2962                         break;
2963
2964                 /* Injection successful */
2965                 cfil_queue_remove(inject_q, data, datalen);
2966                 mbuf_freem(data);
2967
2968                 cfi_buf->cfi_pending_first += datalen;
2969                 cfi_buf->cfi_pending_mbcnt -= mbcnt;
2970                 cfil_info_buf_verify(cfi_buf);
2971
2972                 if (outgoing)
2973                         OSAddAtomic64(datalen,
2974                                 &cfil_stats.cfs_inject_q_out_passed);
2975                 else
2976                         OSAddAtomic64(datalen,
2977                                 &cfil_stats.cfs_inject_q_in_passed);
2978         }
2979
2980         /* A single wakeup is for several packets is more efficient */
2981         if (need_rwakeup)
2982                 sorwakeup(so);
2983
2984         if (error != 0 && so->so_cfil) {
2985                 if (error == ENOBUFS)
2986                         OSIncrementAtomic(&cfil_stats.cfs_inject_q_nobufs);
2987                 if (error == ENOMEM)
2988                         OSIncrementAtomic(&cfil_stats.cfs_inject_q_nomem);
2989
2990                 if (outgoing) {
2991                         so->so_cfil->cfi_flags |= CFIF_RETRY_INJECT_OUT;
2992                         OSIncrementAtomic(&cfil_stats.cfs_inject_q_out_fail);
2993                 } else {
2994                         so->so_cfil->cfi_flags |= CFIF_RETRY_INJECT_IN;
2995                         OSIncrementAtomic(&cfil_stats.cfs_inject_q_in_fail);
2996                 }
2997         }
2998
2999         /*
3000          * Notify
3001          */
3002         if (so->so_cfil && (so->so_cfil->cfi_flags & CFIF_SHUT_WR)) {
3003                 cfil_sock_notify_shutdown(so, SHUT_WR);
3004                 if (cfil_sock_data_pending(&so->so_snd) == 0)
3005                         soshutdownlock_final(so, SHUT_WR);
3006         }
3007         if (so->so_cfil && (so->so_cfil->cfi_flags & CFIF_CLOSE_WAIT)) {
3008                 if (cfil_filters_attached(so) == 0) {
3009                         CFIL_LOG(LOG_INFO, "so %llx waking",
3010                                 (uint64_t)VM_KERNEL_ADDRPERM(so));
3011                         wakeup((caddr_t)&so->so_cfil);
3012                 }
3013         }
3014
3015         CFIL_INFO_VERIFY(so->so_cfil);
3016
3017         return (error);
3018 }
3019
3020 static int
3021 cfil_service_pending_queue(struct socket *so, uint32_t kcunit, int outgoing)
3022 {
3023         uint64_t passlen, curlen;
3024         mbuf_t data;
3025         unsigned int datalen;
3026         errno_t error = 0;
3027         struct cfil_entry *entry;
3028         struct cfe_buf *entrybuf;
3029         struct cfil_queue *pending_q;
3030
3031         CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d",
3032                 (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing);
3033
3034         socket_lock_assert_owned(so);
3035
3036         entry = &so->so_cfil->cfi_entries[kcunit - 1];
3037         if (outgoing)
3038                 entrybuf = &entry->cfe_snd;
3039         else
3040                 entrybuf = &entry->cfe_rcv;
3041
3042         pending_q = &entrybuf->cfe_pending_q;
3043
3044         passlen = entrybuf->cfe_pass_offset - pending_q->q_start;
3045
3046         /*
3047          * Locate the chunks of data that we can pass to the next filter
3048          * A data chunk must be on mbuf boundaries
3049          */
3050         curlen = 0;
3051         while ((data = cfil_queue_first(pending_q)) != NULL) {
3052                 datalen = cfil_data_length(data, NULL);
3053
3054                 CFIL_LOG(LOG_INFO,
3055                         "data %llx datalen %u passlen %llu curlen %llu",
3056                         (uint64_t)VM_KERNEL_ADDRPERM(data), datalen,
3057                         passlen, curlen);
3058
3059                 if (curlen + datalen > passlen)
3060                         break;
3061
3062                 cfil_queue_remove(pending_q, data, datalen);
3063
3064                 curlen += datalen;
3065
3066                 for (kcunit += 1;
3067                         kcunit <= MAX_CONTENT_FILTER;
3068                         kcunit++) {
3069                         error = cfil_data_filter(so, kcunit, outgoing,
3070                                 data, datalen);
3071                         /* 0 means passed so we can continue */
3072                         if (error != 0)
3073                                 break;
3074                 }
3075                 /* When data has passed all filters, re-inject */
3076                 if (error == 0) {
3077                         if (outgoing) {
3078                                 cfil_queue_enqueue(
3079                                         &so->so_cfil->cfi_snd.cfi_inject_q,
3080                                         data, datalen);
3081                                 OSAddAtomic64(datalen,
3082                                         &cfil_stats.cfs_inject_q_out_enqueued);
3083                         } else {
3084                                 cfil_queue_enqueue(
3085                                         &so->so_cfil->cfi_rcv.cfi_inject_q,
3086                                         data, datalen);
3087                                 OSAddAtomic64(datalen,
3088                                         &cfil_stats.cfs_inject_q_in_enqueued);
3089                         }
3090                 }
3091         }
3092
3093         CFIL_INFO_VERIFY(so->so_cfil);
3094
3095         return (error);
3096 }
3097
3098 int
3099 cfil_update_data_offsets(struct socket *so, uint32_t kcunit, int outgoing,
3100         uint64_t pass_offset, uint64_t peek_offset)
3101 {
3102         errno_t error = 0;
3103         struct cfil_entry *entry = NULL;
3104         struct cfe_buf *entrybuf;
3105         int updated = 0;
3106
3107         CFIL_LOG(LOG_INFO, "pass %llu peek %llu", pass_offset, peek_offset);
3108
3109         socket_lock_assert_owned(so);
3110
3111         if (so->so_cfil == NULL) {
3112                 CFIL_LOG(LOG_ERR, "so %llx cfil detached",
3113                         (uint64_t)VM_KERNEL_ADDRPERM(so));
3114                 error = 0;
3115                 goto done;
3116         } else if (so->so_cfil->cfi_flags & CFIF_DROP) {
3117                 CFIL_LOG(LOG_ERR, "so %llx drop set",
3118                         (uint64_t)VM_KERNEL_ADDRPERM(so));
3119                 error = EPIPE;
3120                 goto done;
3121         }
3122
3123         entry = &so->so_cfil->cfi_entries[kcunit - 1];
3124         if (outgoing)
3125                 entrybuf = &entry->cfe_snd;
3126         else
3127                 entrybuf = &entry->cfe_rcv;
3128
3129         /* Record updated offsets for this content filter */
3130         if (pass_offset > entrybuf->cfe_pass_offset) {
3131                 entrybuf->cfe_pass_offset = pass_offset;
3132
3133                 if (entrybuf->cfe_peek_offset < entrybuf->cfe_pass_offset)
3134                         entrybuf->cfe_peek_offset = entrybuf->cfe_pass_offset;
3135                 updated = 1;
3136         } else {
3137                 CFIL_LOG(LOG_INFO, "pass_offset %llu <= cfe_pass_offset %llu",
3138                         pass_offset, entrybuf->cfe_pass_offset);
3139         }
3140         /* Filter does not want or need to see data that's allowed to pass */
3141         if (peek_offset > entrybuf->cfe_pass_offset &&
3142                 peek_offset > entrybuf->cfe_peek_offset) {
3143                 entrybuf->cfe_peek_offset = peek_offset;
3144                 updated = 1;
3145         }
3146         /* Nothing to do */
3147         if (updated == 0)
3148                 goto done;
3149
3150         /* Move data held in control queue to pending queue if needed */
3151         error = cfil_data_service_ctl_q(so, kcunit, outgoing);
3152         if (error != 0) {
3153                 CFIL_LOG(LOG_ERR, "cfil_data_service_ctl_q() error %d",
3154                         error);
3155                 goto done;
3156         }
3157         error = EJUSTRETURN;
3158
3159 done:
3160         /*
3161          * The filter is effectively detached when pass all from both sides
3162          * or when the socket is closed and no more data is waiting
3163          * to be delivered to the filter
3164          */
3165         if (entry != NULL &&
3166             ((entry->cfe_snd.cfe_pass_offset == CFM_MAX_OFFSET &&
3167             entry->cfe_rcv.cfe_pass_offset == CFM_MAX_OFFSET) ||
3168             ((so->so_cfil->cfi_flags & CFIF_CLOSE_WAIT) &&
3169             cfil_queue_empty(&entry->cfe_snd.cfe_ctl_q) &&
3170             cfil_queue_empty(&entry->cfe_rcv.cfe_ctl_q)))) {
3171                 entry->cfe_flags |= CFEF_CFIL_DETACHED;
3172                 CFIL_LOG(LOG_INFO, "so %llx detached %u",
3173                         (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit);
3174                 if ((so->so_cfil->cfi_flags & CFIF_CLOSE_WAIT) &&
3175                     cfil_filters_attached(so) == 0) {
3176                         CFIL_LOG(LOG_INFO, "so %llx waking",
3177                                 (uint64_t)VM_KERNEL_ADDRPERM(so));
3178                         wakeup((caddr_t)&so->so_cfil);
3179                 }
3180         }
3181         CFIL_INFO_VERIFY(so->so_cfil);
3182         CFIL_LOG(LOG_INFO, "return %d", error);
3183         return (error);
3184 }
3185
3186 /*
3187  * Update pass offset for socket when no data is pending
3188  */
3189 static int
3190 cfil_set_socket_pass_offset(struct socket *so, int outgoing)
3191 {
3192         struct cfi_buf *cfi_buf;
3193         struct cfil_entry *entry;
3194         struct cfe_buf *entrybuf;
3195         uint32_t kcunit;
3196         uint64_t pass_offset = 0;
3197
3198         if (so->so_cfil == NULL)
3199                 return (0);
3200
3201         CFIL_LOG(LOG_INFO, "so %llx outgoing %d",
3202                 (uint64_t)VM_KERNEL_ADDRPERM(so), outgoing);
3203
3204         socket_lock_assert_owned(so);
3205
3206         if (outgoing)
3207                 cfi_buf = &so->so_cfil->cfi_snd;
3208         else
3209                 cfi_buf = &so->so_cfil->cfi_rcv;
3210
3211         if (cfi_buf->cfi_pending_last - cfi_buf->cfi_pending_first == 0) {
3212                 for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
3213                         entry = &so->so_cfil->cfi_entries[kcunit - 1];
3214
3215                         /* Are we attached to a filter? */
3216                         if (entry->cfe_filter == NULL)
3217                                 continue;
3218
3219                         if (outgoing)
3220                                 entrybuf = &entry->cfe_snd;
3221                         else
3222                                 entrybuf = &entry->cfe_rcv;
3223
3224                         if (pass_offset == 0 ||
3225                             entrybuf->cfe_pass_offset < pass_offset)
3226                                 pass_offset = entrybuf->cfe_pass_offset;
3227                 }
3228                 cfi_buf->cfi_pass_offset = pass_offset;
3229         }
3230
3231         return (0);
3232 }
3233
3234 int
3235 cfil_action_data_pass(struct socket *so, uint32_t kcunit, int outgoing,
3236         uint64_t pass_offset, uint64_t peek_offset)
3237 {
3238         errno_t error = 0;
3239
3240         CFIL_LOG(LOG_INFO, "");
3241
3242         socket_lock_assert_owned(so);
3243
3244         error = cfil_acquire_sockbuf(so, outgoing);
3245         if (error != 0) {
3246                 CFIL_LOG(LOG_INFO, "so %llx %s dropped",
3247                         (uint64_t)VM_KERNEL_ADDRPERM(so),
3248                         outgoing ? "out" : "in");
3249                 goto release;
3250         }
3251
3252         error = cfil_update_data_offsets(so, kcunit, outgoing,
3253                 pass_offset, peek_offset);
3254
3255         cfil_service_inject_queue(so, outgoing);
3256
3257         cfil_set_socket_pass_offset(so, outgoing);
3258 release:
3259         CFIL_INFO_VERIFY(so->so_cfil);
3260         cfil_release_sockbuf(so, outgoing);
3261
3262         return (error);
3263 }
3264
3265
3266 static void
3267 cfil_flush_queues(struct socket *so)
3268 {
3269         struct cfil_entry *entry;
3270         int kcunit;
3271         uint64_t drained;
3272
3273         if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
3274                 goto done;
3275
3276         socket_lock_assert_owned(so);
3277
3278         /*
3279          * Flush the output queues and ignore errors as long as
3280          * we are attached
3281          */
3282         (void) cfil_acquire_sockbuf(so, 1);
3283         if (so->so_cfil != NULL) {
3284                 drained = 0;
3285                 for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
3286                         entry = &so->so_cfil->cfi_entries[kcunit - 1];
3287
3288                         drained += cfil_queue_drain(&entry->cfe_snd.cfe_ctl_q);
3289                         drained += cfil_queue_drain(
3290                             &entry->cfe_snd.cfe_pending_q);
3291                 }
3292                 drained += cfil_queue_drain(&so->so_cfil->cfi_snd.cfi_inject_q);
3293                 if (drained) {
3294                         if (so->so_cfil->cfi_flags & CFIF_DROP)
3295                                 OSIncrementAtomic(
3296                                         &cfil_stats.cfs_flush_out_drop);
3297                         else
3298                                 OSIncrementAtomic(
3299                                         &cfil_stats.cfs_flush_out_close);
3300                 }
3301         }
3302         cfil_release_sockbuf(so, 1);
3303
3304         /*
3305          * Flush the input queues
3306          */
3307         (void) cfil_acquire_sockbuf(so, 0);
3308         if (so->so_cfil != NULL) {
3309                 drained = 0;
3310                 for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
3311                         entry = &so->so_cfil->cfi_entries[kcunit - 1];
3312
3313                                 drained += cfil_queue_drain(
3314                                         &entry->cfe_rcv.cfe_ctl_q);
3315                                 drained += cfil_queue_drain(
3316                                         &entry->cfe_rcv.cfe_pending_q);
3317                 }
3318                 drained += cfil_queue_drain(&so->so_cfil->cfi_rcv.cfi_inject_q);
3319                 if (drained) {
3320                         if (so->so_cfil->cfi_flags & CFIF_DROP)
3321                                 OSIncrementAtomic(
3322                                         &cfil_stats.cfs_flush_in_drop);
3323                         else
3324                                 OSIncrementAtomic(
3325                                         &cfil_stats.cfs_flush_in_close);
3326                 }
3327         }
3328         cfil_release_sockbuf(so, 0);
3329 done:
3330         CFIL_INFO_VERIFY(so->so_cfil);
3331 }
3332
3333 int
3334 cfil_action_drop(struct socket *so, uint32_t kcunit)
3335 {
3336         errno_t error = 0;
3337         struct cfil_entry *entry;
3338         struct proc *p;
3339
3340         if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
3341                 goto done;
3342
3343         socket_lock_assert_owned(so);
3344
3345         entry = &so->so_cfil->cfi_entries[kcunit - 1];
3346
3347         /* Are we attached to the filter? */
3348         if (entry->cfe_filter == NULL)
3349                 goto done;
3350
3351         so->so_cfil->cfi_flags |= CFIF_DROP;
3352
3353         p = current_proc();
3354
3355         /*
3356          * Force the socket to be marked defunct
3357          * (forcing fixed along with rdar://19391339)
3358          */
3359         error = sosetdefunct(p, so,
3360             SHUTDOWN_SOCKET_LEVEL_CONTENT_FILTER | SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL,
3361             FALSE);
3362
3363         /* Flush the socket buffer and disconnect */
3364         if (error == 0)
3365                 error = sodefunct(p, so,
3366                     SHUTDOWN_SOCKET_LEVEL_CONTENT_FILTER | SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
3367
3368         /* The filter is done, mark as detached */
3369         entry->cfe_flags |= CFEF_CFIL_DETACHED;
3370         CFIL_LOG(LOG_INFO, "so %llx detached %u",
3371                 (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit);
3372
3373         /* Pending data needs to go */
3374         cfil_flush_queues(so);
3375
3376         if (so->so_cfil && (so->so_cfil->cfi_flags & CFIF_CLOSE_WAIT)) {
3377                 if (cfil_filters_attached(so) == 0) {
3378                         CFIL_LOG(LOG_INFO, "so %llx waking",
3379                                 (uint64_t)VM_KERNEL_ADDRPERM(so));
3380                         wakeup((caddr_t)&so->so_cfil);
3381                 }
3382         }
3383 done:
3384         return (error);
3385 }
3386
3387 int
3388 cfil_action_bless_client(uint32_t kcunit, struct cfil_msg_hdr *msghdr)
3389 {
3390         errno_t error = 0;
3391
3392         cfil_rw_lock_exclusive(&cfil_lck_rw);
3393
3394         bool cfil_attached = false;
3395         struct cfil_msg_bless_client *blessmsg = (struct cfil_msg_bless_client *)msghdr;
3396         struct socket *so = cfil_socket_from_client_uuid(blessmsg->cfb_client_uuid, &cfil_attached);
3397         if (so == NULL) {
3398                 error = ENOENT;
3399         } else {
3400                 // The client gets a pass automatically
3401                 socket_lock(so, 1);
3402                 if (cfil_attached) {
3403                         (void)cfil_action_data_pass(so, kcunit, 1, CFM_MAX_OFFSET, CFM_MAX_OFFSET);
3404                         (void)cfil_action_data_pass(so, kcunit, 0, CFM_MAX_OFFSET, CFM_MAX_OFFSET);
3405                 } else {
3406                         so->so_flags1 |= SOF1_CONTENT_FILTER_SKIP;
3407                 }
3408                 socket_unlock(so, 1);
3409         }
3410
3411         cfil_rw_unlock_exclusive(&cfil_lck_rw);
3412
3413         return (error);
3414 }
3415
3416 static int
3417 cfil_update_entry_offsets(struct socket *so, int outgoing, unsigned int datalen)
3418 {
3419         struct cfil_entry *entry;
3420         struct cfe_buf *entrybuf;
3421         uint32_t kcunit;
3422
3423         CFIL_LOG(LOG_INFO, "so %llx outgoing %d datalen %u",
3424                 (uint64_t)VM_KERNEL_ADDRPERM(so), outgoing, datalen);
3425
3426         for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
3427                 entry = &so->so_cfil->cfi_entries[kcunit - 1];
3428
3429                 /* Are we attached to the filter? */
3430                 if (entry->cfe_filter == NULL)
3431                         continue;
3432
3433                 if (outgoing)
3434                         entrybuf = &entry->cfe_snd;
3435                 else
3436                         entrybuf = &entry->cfe_rcv;
3437
3438                 entrybuf->cfe_ctl_q.q_start += datalen;
3439                 entrybuf->cfe_pass_offset = entrybuf->cfe_ctl_q.q_start;
3440                 entrybuf->cfe_peeked = entrybuf->cfe_ctl_q.q_start;
3441                 if (entrybuf->cfe_peek_offset < entrybuf->cfe_pass_offset)
3442                         entrybuf->cfe_peek_offset = entrybuf->cfe_pass_offset;
3443
3444                 entrybuf->cfe_ctl_q.q_end += datalen;
3445
3446                 entrybuf->cfe_pending_q.q_start += datalen;
3447                 entrybuf->cfe_pending_q.q_end += datalen;
3448         }
3449         CFIL_INFO_VERIFY(so->so_cfil);
3450         return (0);
3451 }
3452
3453 int
3454 cfil_data_common(struct socket *so, int outgoing, struct sockaddr *to,
3455                 struct mbuf *data, struct mbuf *control, uint32_t flags)
3456 {
3457 #pragma unused(to, control, flags)
3458         errno_t error = 0;
3459         unsigned int datalen;
3460         int mbcnt;
3461         int kcunit;
3462         struct cfi_buf *cfi_buf;
3463
3464         if (so->so_cfil == NULL) {
3465                 CFIL_LOG(LOG_ERR, "so %llx cfil detached",
3466                         (uint64_t)VM_KERNEL_ADDRPERM(so));
3467                 error = 0;
3468                 goto done;
3469         } else if (so->so_cfil->cfi_flags & CFIF_DROP) {
3470                 CFIL_LOG(LOG_ERR, "so %llx drop set",
3471                         (uint64_t)VM_KERNEL_ADDRPERM(so));
3472                 error = EPIPE;
3473                 goto done;
3474         }
3475
3476         datalen = cfil_data_length(data, &mbcnt);
3477
3478         CFIL_LOG(LOG_INFO, "so %llx %s m %llx len %u flags 0x%x nextpkt %llx",
3479                 (uint64_t)VM_KERNEL_ADDRPERM(so),
3480                 outgoing ? "out" : "in",
3481                 (uint64_t)VM_KERNEL_ADDRPERM(data), datalen, data->m_flags,
3482                 (uint64_t)VM_KERNEL_ADDRPERM(data->m_nextpkt));
3483
3484         if (outgoing)
3485                 cfi_buf = &so->so_cfil->cfi_snd;
3486         else
3487                 cfi_buf = &so->so_cfil->cfi_rcv;
3488
3489         cfi_buf->cfi_pending_last += datalen;
3490         cfi_buf->cfi_pending_mbcnt += mbcnt;
3491         cfil_info_buf_verify(cfi_buf);
3492
3493         CFIL_LOG(LOG_INFO, "so %llx cfi_pending_last %llu cfi_pass_offset %llu",
3494                 (uint64_t)VM_KERNEL_ADDRPERM(so),
3495                 cfi_buf->cfi_pending_last,
3496                 cfi_buf->cfi_pass_offset);
3497
3498         /* Fast path when below pass offset */
3499         if (cfi_buf->cfi_pending_last <= cfi_buf->cfi_pass_offset) {
3500                 cfil_update_entry_offsets(so, outgoing, datalen);
3501         } else {
3502                 for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
3503                         error = cfil_data_filter(so, kcunit, outgoing, data,
3504                                 datalen);
3505                         /* 0 means passed so continue with next filter */
3506                         if (error != 0)
3507                                 break;
3508                 }
3509         }
3510
3511         /* Move cursor if no filter claimed the data */
3512         if (error == 0) {
3513                 cfi_buf->cfi_pending_first += datalen;
3514                 cfi_buf->cfi_pending_mbcnt -= mbcnt;
3515                 cfil_info_buf_verify(cfi_buf);
3516         }
3517 done:
3518         CFIL_INFO_VERIFY(so->so_cfil);
3519
3520         return (error);
3521 }
3522
3523 /*
3524  * Callback from socket layer sosendxxx()
3525  */
3526 int
3527 cfil_sock_data_out(struct socket *so, struct sockaddr  *to,
3528                 struct mbuf *data, struct mbuf *control, uint32_t flags)
3529 {
3530         int error = 0;
3531
3532         if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
3533                 return (0);
3534
3535         socket_lock_assert_owned(so);
3536
3537         if (so->so_cfil->cfi_flags & CFIF_DROP) {
3538                 CFIL_LOG(LOG_ERR, "so %llx drop set",
3539                         (uint64_t)VM_KERNEL_ADDRPERM(so));
3540                 return (EPIPE);
3541         }
3542         if (control != NULL) {
3543                 CFIL_LOG(LOG_ERR, "so %llx control",
3544                         (uint64_t)VM_KERNEL_ADDRPERM(so));
3545                 OSIncrementAtomic(&cfil_stats.cfs_data_out_control);
3546         }
3547         if ((flags & MSG_OOB)) {
3548                 CFIL_LOG(LOG_ERR, "so %llx MSG_OOB",
3549                         (uint64_t)VM_KERNEL_ADDRPERM(so));
3550                 OSIncrementAtomic(&cfil_stats.cfs_data_out_oob);
3551         }
3552         if ((so->so_snd.sb_flags & SB_LOCK) == 0)
3553                 panic("so %p SB_LOCK not set", so);
3554
3555         if (so->so_snd.sb_cfil_thread != NULL)
3556                 panic("%s sb_cfil_thread %p not NULL", __func__,
3557                         so->so_snd.sb_cfil_thread);
3558
3559         error = cfil_data_common(so, 1, to, data, control, flags);
3560
3561         return (error);
3562 }
3563
3564 /*
3565  * Callback from socket layer sbappendxxx()
3566  */
3567 int
3568 cfil_sock_data_in(struct socket *so, struct sockaddr *from,
3569         struct mbuf *data, struct mbuf *control, uint32_t flags)
3570 {
3571         int error = 0;
3572
3573         if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
3574                 return (0);
3575
3576         socket_lock_assert_owned(so);
3577
3578         if (so->so_cfil->cfi_flags & CFIF_DROP) {
3579                 CFIL_LOG(LOG_ERR, "so %llx drop set",
3580                         (uint64_t)VM_KERNEL_ADDRPERM(so));
3581                 return (EPIPE);
3582         }
3583         if (control != NULL) {
3584                 CFIL_LOG(LOG_ERR, "so %llx control",
3585                         (uint64_t)VM_KERNEL_ADDRPERM(so));
3586                 OSIncrementAtomic(&cfil_stats.cfs_data_in_control);
3587         }
3588         if (data->m_type == MT_OOBDATA) {
3589                 CFIL_LOG(LOG_ERR, "so %llx MSG_OOB",
3590                         (uint64_t)VM_KERNEL_ADDRPERM(so));
3591                 OSIncrementAtomic(&cfil_stats.cfs_data_in_oob);
3592         }
3593         error = cfil_data_common(so, 0, from, data, control, flags);
3594
3595         return (error);
3596 }
3597
3598 /*
3599  * Callback from socket layer soshutdownxxx()
3600  *
3601  * We may delay the shutdown write if there's outgoing data in process.
3602  *
3603  * There is no point in delaying the shutdown read because the process
3604  * indicated that it does not want to read anymore data.
3605  */
3606 int
3607 cfil_sock_shutdown(struct socket *so, int *how)
3608 {
3609         int error = 0;
3610
3611         if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
3612                 goto done;
3613
3614         socket_lock_assert_owned(so);
3615
3616         CFIL_LOG(LOG_INFO, "so %llx how %d",
3617                 (uint64_t)VM_KERNEL_ADDRPERM(so), *how);
3618
3619         /*
3620          * Check the state of the socket before the content filter
3621          */
3622         if (*how != SHUT_WR && (so->so_state & SS_CANTRCVMORE) != 0) {
3623                 /* read already shut down */
3624                 error = ENOTCONN;
3625                 goto done;
3626         }
3627         if (*how != SHUT_RD && (so->so_state & SS_CANTSENDMORE) != 0) {
3628                 /* write already shut down */
3629                 error = ENOTCONN;
3630                 goto done;
3631         }
3632
3633         if ((so->so_cfil->cfi_flags & CFIF_DROP) != 0) {
3634                 CFIL_LOG(LOG_ERR, "so %llx drop set",
3635                         (uint64_t)VM_KERNEL_ADDRPERM(so));
3636                 goto done;
3637         }
3638
3639         /*
3640          * shutdown read: SHUT_RD or SHUT_RDWR
3641          */
3642         if (*how != SHUT_WR) {
3643                 if (so->so_cfil->cfi_flags & CFIF_SHUT_RD) {
3644                         error = ENOTCONN;
3645                         goto done;
3646                 }
3647                 so->so_cfil->cfi_flags |= CFIF_SHUT_RD;
3648                 cfil_sock_notify_shutdown(so, SHUT_RD);
3649         }
3650         /*
3651          * shutdown write: SHUT_WR or SHUT_RDWR
3652          */
3653         if (*how != SHUT_RD) {
3654                 if (so->so_cfil->cfi_flags & CFIF_SHUT_WR) {
3655                         error = ENOTCONN;
3656                         goto done;
3657                 }
3658                 so->so_cfil->cfi_flags |= CFIF_SHUT_WR;
3659                 cfil_sock_notify_shutdown(so, SHUT_WR);
3660                 /*
3661                  * When outgoing data is pending, we delay the shutdown at the
3662                  * protocol level until the content filters give the final
3663                  * verdict on the pending data.
3664                  */
3665                 if (cfil_sock_data_pending(&so->so_snd) != 0) {
3666                         /*
3667                          * When shutting down the read and write sides at once
3668                          * we can proceed to the final shutdown of the read
3669                          * side. Otherwise, we just return.
3670                          */
3671                         if (*how == SHUT_WR) {
3672                                 error = EJUSTRETURN;
3673                         } else if (*how == SHUT_RDWR) {
3674                                 *how = SHUT_RD;
3675                         }
3676                 }
3677         }
3678 done:
3679         return (error);
3680 }
3681
3682 /*
3683  * This is called when the socket is closed and there is no more
3684  * opportunity for filtering
3685  */
3686 void
3687 cfil_sock_is_closed(struct socket *so)
3688 {
3689         errno_t error = 0;
3690         int kcunit;
3691
3692         if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
3693                 return;
3694
3695         CFIL_LOG(LOG_INFO, "so %llx", (uint64_t)VM_KERNEL_ADDRPERM(so));
3696
3697         socket_lock_assert_owned(so);
3698
3699         for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
3700                 /* Let the filters know of the closing */
3701                 error = cfil_dispatch_closed_event(so, kcunit);
3702         }
3703
3704         /* Last chance to push passed data out */
3705         error = cfil_acquire_sockbuf(so, 1);
3706         if (error == 0)
3707                 cfil_service_inject_queue(so, 1);
3708         cfil_release_sockbuf(so, 1);
3709
3710         so->so_cfil->cfi_flags |= CFIF_SOCK_CLOSED;
3711
3712         /* Pending data needs to go */
3713         cfil_flush_queues(so);
3714
3715         CFIL_INFO_VERIFY(so->so_cfil);
3716 }
3717
3718 /*
3719  * This is called when the socket is disconnected so let the filters
3720  * know about the disconnection and that no more data will come
3721  *
3722  * The how parameter has the same values as soshutown()
3723  */
3724 void
3725 cfil_sock_notify_shutdown(struct socket *so, int how)
3726 {
3727         errno_t error = 0;
3728         int kcunit;
3729
3730         if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
3731                 return;
3732
3733         CFIL_LOG(LOG_INFO, "so %llx how %d",
3734                 (uint64_t)VM_KERNEL_ADDRPERM(so), how);
3735
3736         socket_lock_assert_owned(so);
3737
3738         for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
3739                 /* Disconnect incoming side */
3740                 if (how != SHUT_WR)
3741                         error = cfil_dispatch_disconnect_event(so, kcunit, 0);
3742                 /* Disconnect outgoing side */
3743                 if (how != SHUT_RD)
3744                         error = cfil_dispatch_disconnect_event(so, kcunit, 1);
3745         }
3746 }
3747
3748 static int
3749 cfil_filters_attached(struct socket *so)
3750 {
3751         struct cfil_entry *entry;
3752         uint32_t kcunit;
3753         int attached = 0;
3754
3755         if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
3756                 return (0);
3757
3758         socket_lock_assert_owned(so);
3759
3760         for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
3761                 entry = &so->so_cfil->cfi_entries[kcunit - 1];
3762
3763                 /* Are we attached to the filter? */
3764                 if (entry->cfe_filter == NULL)
3765                         continue;
3766                 if ((entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED) == 0)
3767                         continue;
3768                 if ((entry->cfe_flags & CFEF_CFIL_DETACHED) != 0)
3769                         continue;
3770                 attached = 1;
3771                 break;
3772         }
3773
3774         return (attached);
3775 }
3776
3777 /*
3778  * This is called when the socket is closed and we are waiting for
3779  * the filters to gives the final pass or drop
3780  */
3781 void
3782 cfil_sock_close_wait(struct socket *so)
3783 {
3784         lck_mtx_t *mutex_held;
3785         struct timespec ts;
3786         int error;
3787
3788         if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
3789                 return;
3790
3791         CFIL_LOG(LOG_INFO, "so %llx", (uint64_t)VM_KERNEL_ADDRPERM(so));
3792
3793         if (so->so_proto->pr_getlock != NULL)
3794                 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
3795         else
3796                 mutex_held = so->so_proto->pr_domain->dom_mtx;
3797         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
3798
3799         while (cfil_filters_attached(so)) {
3800                 /*
3801                  * Notify the filters we are going away so they can detach
3802                  */
3803                 cfil_sock_notify_shutdown(so, SHUT_RDWR);
3804
3805                 /*
3806                  * Make sure we need to wait after the filter are notified
3807                  * of the disconnection
3808                  */
3809                 if (cfil_filters_attached(so) == 0)
3810                         break;
3811
3812                 CFIL_LOG(LOG_INFO, "so %llx waiting",
3813                         (uint64_t)VM_KERNEL_ADDRPERM(so));
3814
3815                 ts.tv_sec = cfil_close_wait_timeout / 1000;
3816                 ts.tv_nsec = (cfil_close_wait_timeout % 1000) *
3817                         NSEC_PER_USEC * 1000;
3818
3819                 OSIncrementAtomic(&cfil_stats.cfs_close_wait);
3820                 so->so_cfil->cfi_flags |= CFIF_CLOSE_WAIT;
3821                 error = msleep((caddr_t)&so->so_cfil, mutex_held,
3822                         PSOCK | PCATCH, "cfil_sock_close_wait", &ts);
3823                 so->so_cfil->cfi_flags &= ~CFIF_CLOSE_WAIT;
3824
3825                 CFIL_LOG(LOG_NOTICE, "so %llx timed out %d",
3826                         (uint64_t)VM_KERNEL_ADDRPERM(so), (error != 0));
3827
3828                 /*
3829                  * Force close in case of timeout
3830                  */
3831                 if (error != 0) {
3832                         OSIncrementAtomic(&cfil_stats.cfs_close_wait_timeout);
3833                         break;
3834                 }
3835         }
3836
3837 }
3838
3839 /*
3840  * Returns the size of the data held by the content filter by using
3841  */
3842 int32_t
3843 cfil_sock_data_pending(struct sockbuf *sb)
3844 {
3845         struct socket *so = sb->sb_so;
3846         uint64_t pending = 0;
3847
3848         if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_cfil != NULL) {
3849                 struct cfi_buf *cfi_buf;
3850
3851                 socket_lock_assert_owned(so);
3852
3853                 if ((sb->sb_flags & SB_RECV) == 0)
3854                         cfi_buf = &so->so_cfil->cfi_snd;
3855                 else
3856                         cfi_buf = &so->so_cfil->cfi_rcv;
3857
3858                 pending = cfi_buf->cfi_pending_last -
3859                         cfi_buf->cfi_pending_first;
3860
3861                 /*
3862                  * If we are limited by the "chars of mbufs used" roughly
3863                  * adjust so we won't overcommit
3864                  */
3865                 if (pending > (uint64_t)cfi_buf->cfi_pending_mbcnt)
3866                         pending = cfi_buf->cfi_pending_mbcnt;
3867         }
3868
3869         VERIFY(pending < INT32_MAX);
3870
3871         return (int32_t)(pending);
3872 }
3873
3874 /*
3875  * Return the socket buffer space used by data being held by content filters
3876  * so processes won't clog the socket buffer
3877  */
3878 int32_t
3879 cfil_sock_data_space(struct sockbuf *sb)
3880 {
3881         struct socket *so = sb->sb_so;
3882         uint64_t pending = 0;
3883
3884         if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_cfil != NULL &&
3885                 so->so_snd.sb_cfil_thread != current_thread()) {
3886                 struct cfi_buf *cfi_buf;
3887
3888                 socket_lock_assert_owned(so);
3889
3890                 if ((sb->sb_flags & SB_RECV) == 0)
3891                         cfi_buf = &so->so_cfil->cfi_snd;
3892                 else
3893                         cfi_buf = &so->so_cfil->cfi_rcv;
3894
3895                 pending = cfi_buf->cfi_pending_last -
3896                         cfi_buf->cfi_pending_first;
3897
3898                 /*
3899                  * If we are limited by the "chars of mbufs used" roughly
3900                  * adjust so we won't overcommit
3901                  */
3902                 if ((uint64_t)cfi_buf->cfi_pending_mbcnt > pending)
3903                         pending = cfi_buf->cfi_pending_mbcnt;
3904         }
3905
3906         VERIFY(pending < INT32_MAX);
3907
3908         return (int32_t)(pending);
3909 }
3910
3911 /*
3912  * A callback from the socket and protocol layer when data becomes
3913  * available in the socket buffer to give a chance for the content filter
3914  * to re-inject data that was held back
3915  */
3916 void
3917 cfil_sock_buf_update(struct sockbuf *sb)
3918 {
3919         int outgoing;
3920         int error;
3921         struct socket *so = sb->sb_so;
3922
3923         if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
3924                 return;
3925
3926         if (!cfil_sbtrim)
3927                 return;
3928
3929         socket_lock_assert_owned(so);
3930
3931         if ((sb->sb_flags & SB_RECV) == 0) {
3932                 if ((so->so_cfil->cfi_flags & CFIF_RETRY_INJECT_OUT) == 0)
3933                         return;
3934                 outgoing = 1;
3935                 OSIncrementAtomic(&cfil_stats.cfs_inject_q_out_retry);
3936         } else {
3937                 if ((so->so_cfil->cfi_flags & CFIF_RETRY_INJECT_IN) == 0)
3938                         return;
3939                 outgoing = 0;
3940                 OSIncrementAtomic(&cfil_stats.cfs_inject_q_in_retry);
3941         }
3942
3943         CFIL_LOG(LOG_NOTICE, "so %llx outgoing %d",
3944                 (uint64_t)VM_KERNEL_ADDRPERM(so), outgoing);
3945
3946         error = cfil_acquire_sockbuf(so, outgoing);
3947         if (error == 0)
3948                 cfil_service_inject_queue(so, outgoing);
3949         cfil_release_sockbuf(so, outgoing);
3950 }
3951
3952 int
3953 sysctl_cfil_filter_list(struct sysctl_oid *oidp, void *arg1, int arg2,
3954         struct sysctl_req *req)
3955 {
3956 #pragma unused(oidp, arg1, arg2)
3957         int error = 0;
3958         size_t len = 0;
3959         u_int32_t i;
3960
3961         /* Read only  */
3962         if (req->newptr != USER_ADDR_NULL)
3963                 return (EPERM);
3964
3965         cfil_rw_lock_shared(&cfil_lck_rw);
3966
3967         for (i = 0; content_filters != NULL && i < MAX_CONTENT_FILTER; i++) {
3968                 struct cfil_filter_stat filter_stat;
3969                 struct content_filter *cfc = content_filters[i];
3970
3971                 if (cfc == NULL)
3972                         continue;
3973
3974                 /* If just asking for the size */
3975                 if (req->oldptr == USER_ADDR_NULL) {
3976                         len += sizeof(struct cfil_filter_stat);
3977                         continue;
3978                 }
3979
3980                 bzero(&filter_stat, sizeof(struct cfil_filter_stat));
3981                 filter_stat.cfs_len = sizeof(struct cfil_filter_stat);
3982                 filter_stat.cfs_filter_id = cfc->cf_kcunit;
3983                 filter_stat.cfs_flags = cfc->cf_flags;
3984                 filter_stat.cfs_sock_count = cfc->cf_sock_count;
3985                 filter_stat.cfs_necp_control_unit = cfc->cf_necp_control_unit;
3986
3987                 error = SYSCTL_OUT(req, &filter_stat,
3988                         sizeof (struct cfil_filter_stat));
3989                 if (error != 0)
3990                         break;
3991         }
3992         /* If just asking for the size */
3993         if (req->oldptr == USER_ADDR_NULL)
3994                 req->oldidx = len;
3995
3996         cfil_rw_unlock_shared(&cfil_lck_rw);
3997
3998         return (error);
3999 }
4000
4001 static int sysctl_cfil_sock_list(struct sysctl_oid *oidp, void *arg1, int arg2,
4002         struct sysctl_req *req)
4003 {
4004 #pragma unused(oidp, arg1, arg2)
4005         int error = 0;
4006         u_int32_t i;
4007         struct cfil_info *cfi;
4008
4009         /* Read only  */
4010         if (req->newptr != USER_ADDR_NULL)
4011                 return (EPERM);
4012
4013         cfil_rw_lock_shared(&cfil_lck_rw);
4014
4015         /*
4016          * If just asking for the size,
4017          */
4018         if (req->oldptr == USER_ADDR_NULL) {
4019                 req->oldidx = cfil_sock_attached_count *
4020                         sizeof(struct cfil_sock_stat);
4021                 /* Bump the length in case new sockets gets attached */
4022                 req->oldidx += req->oldidx >> 3;
4023                 goto done;
4024         }
4025
4026         TAILQ_FOREACH(cfi, &cfil_sock_head, cfi_link) {
4027                 struct cfil_entry *entry;
4028                 struct cfil_sock_stat stat;
4029                 struct socket *so = cfi->cfi_so;
4030
4031                 bzero(&stat, sizeof(struct cfil_sock_stat));
4032                 stat.cfs_len = sizeof(struct cfil_sock_stat);
4033                 stat.cfs_sock_id = cfi->cfi_sock_id;
4034                 stat.cfs_flags = cfi->cfi_flags;
4035
4036                 if (so != NULL) {
4037                         stat.cfs_pid = so->last_pid;
4038                         memcpy(stat.cfs_uuid, so->last_uuid,
4039                                 sizeof(uuid_t));
4040                         if (so->so_flags & SOF_DELEGATED) {
4041                                 stat.cfs_e_pid = so->e_pid;
4042                                 memcpy(stat.cfs_e_uuid, so->e_uuid,
4043                                         sizeof(uuid_t));
4044                         } else {
4045                                 stat.cfs_e_pid = so->last_pid;
4046                                 memcpy(stat.cfs_e_uuid, so->last_uuid,
4047                                         sizeof(uuid_t));
4048                         }
4049                 }
4050
4051                 stat.cfs_snd.cbs_pending_first =
4052                         cfi->cfi_snd.cfi_pending_first;
4053                 stat.cfs_snd.cbs_pending_last =
4054                         cfi->cfi_snd.cfi_pending_last;
4055                 stat.cfs_snd.cbs_inject_q_len =
4056                         cfil_queue_len(&cfi->cfi_snd.cfi_inject_q);
4057                 stat.cfs_snd.cbs_pass_offset =
4058                         cfi->cfi_snd.cfi_pass_offset;
4059
4060                 stat.cfs_rcv.cbs_pending_first =
4061                         cfi->cfi_rcv.cfi_pending_first;
4062                 stat.cfs_rcv.cbs_pending_last =
4063                         cfi->cfi_rcv.cfi_pending_last;
4064                 stat.cfs_rcv.cbs_inject_q_len =
4065                         cfil_queue_len(&cfi->cfi_rcv.cfi_inject_q);
4066                 stat.cfs_rcv.cbs_pass_offset =
4067                         cfi->cfi_rcv.cfi_pass_offset;
4068
4069                 for (i = 0; i < MAX_CONTENT_FILTER; i++) {
4070                         struct cfil_entry_stat *estat;
4071                         struct cfe_buf *ebuf;
4072                         struct cfe_buf_stat *sbuf;
4073
4074                         entry = &cfi->cfi_entries[i];
4075
4076                         estat = &stat.ces_entries[i];
4077
4078                         estat->ces_len = sizeof(struct cfil_entry_stat);
4079                         estat->ces_filter_id = entry->cfe_filter ?
4080                                 entry->cfe_filter->cf_kcunit : 0;
4081                         estat->ces_flags = entry->cfe_flags;
4082                         estat->ces_necp_control_unit =
4083                                 entry->cfe_necp_control_unit;
4084
4085                         estat->ces_last_event.tv_sec =
4086                                 (int64_t)entry->cfe_last_event.tv_sec;
4087                         estat->ces_last_event.tv_usec =
4088                                 (int64_t)entry->cfe_last_event.tv_usec;
4089
4090                         estat->ces_last_action.tv_sec =
4091                                 (int64_t)entry->cfe_last_action.tv_sec;
4092                         estat->ces_last_action.tv_usec =
4093                                 (int64_t)entry->cfe_last_action.tv_usec;
4094
4095                         ebuf = &entry->cfe_snd;
4096                         sbuf = &estat->ces_snd;
4097                         sbuf->cbs_pending_first =
4098                                 cfil_queue_offset_first(&ebuf->cfe_pending_q);
4099                         sbuf->cbs_pending_last =
4100                                 cfil_queue_offset_last(&ebuf->cfe_pending_q);
4101                         sbuf->cbs_ctl_first =
4102                                 cfil_queue_offset_first(&ebuf->cfe_ctl_q);
4103                         sbuf->cbs_ctl_last =
4104                                 cfil_queue_offset_last(&ebuf->cfe_ctl_q);
4105                         sbuf->cbs_pass_offset =  ebuf->cfe_pass_offset;
4106                         sbuf->cbs_peek_offset =  ebuf->cfe_peek_offset;
4107                         sbuf->cbs_peeked =  ebuf->cfe_peeked;
4108
4109                         ebuf = &entry->cfe_rcv;
4110                         sbuf = &estat->ces_rcv;
4111                         sbuf->cbs_pending_first =
4112                                 cfil_queue_offset_first(&ebuf->cfe_pending_q);
4113                         sbuf->cbs_pending_last =
4114                                 cfil_queue_offset_last(&ebuf->cfe_pending_q);
4115                         sbuf->cbs_ctl_first =
4116                                 cfil_queue_offset_first(&ebuf->cfe_ctl_q);
4117                         sbuf->cbs_ctl_last =
4118                                 cfil_queue_offset_last(&ebuf->cfe_ctl_q);
4119                         sbuf->cbs_pass_offset =  ebuf->cfe_pass_offset;
4120                         sbuf->cbs_peek_offset =  ebuf->cfe_peek_offset;
4121                         sbuf->cbs_peeked =  ebuf->cfe_peeked;
4122                 }
4123                 error = SYSCTL_OUT(req, &stat,
4124                         sizeof (struct cfil_sock_stat));
4125                 if (error != 0)
4126                         break;
4127         }
4128 done:
4129         cfil_rw_unlock_shared(&cfil_lck_rw);
4130
4131         return (error);
4132 }