apple/xnu (xnu-3789.70.16) - bsd/net/content_filter.c
1 /*
2 * Copyright (c) 2013-2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24 /*
25 * THEORY OF OPERATION
26 *
27 * The socket content filter subsystem provides a way for user space agents to
28 * make filtering decisions based on the content of the data being sent and
29 * received by TCP/IP sockets.
30 *
31 * A content filter user space agent gets a copy of the data and the data is
32 * also kept in a kernel buffer until the user space agent makes a pass or drop
33 * decision. This unidirectional flow of content avoids unnecessary data copies
34 * back to the kernel.
35 *
36 * A user space filter agent opens a kernel control socket with the name
37 * CONTENT_FILTER_CONTROL_NAME to attach to the socket content filter subsystem.
38 * When connected, a "struct content_filter" is created and set as the
39 * "unitinfo" of the corresponding kernel control socket instance.
40 *
41 * The socket content filter subsystem exchanges messages with the user space
42 * filter agent until an ultimate pass or drop decision is made by the
43 * user space filter agent.
44 *
45 * It should be noted that messages about many TCP/IP sockets can be multiplexed
46 * over a single kernel control socket.
47 *
48 * Notes:
49 * - The current implementation is limited to TCP sockets.
50 * - The current implementation supports up to two simultaneous content filters
51 *   for the sake of simplicity.
52 *
53 *
54 * NECP FILTER CONTROL UNIT
55 *
56 * A user space filter agent uses the Network Extension Control Policy (NECP)
57 * database to specify which TCP/IP sockets need to be filtered. The NECP
58 * criteria may be based on a variety of properties like user ID or proc UUID.
59 *
60 * The NECP "filter control unit" is used by the socket content filter subsystem
61 * to deliver the relevant TCP/IP content information to the appropriate
62 * user space filter agent via its kernel control socket instance.
63 * This works as follows:
64 *
65 * 1) The user space filter agent specifies an NECP filter control unit when
66 * it adds its filtering rules to the NECP database.
67 *
68 * 2) The user space filter agent also sets its NECP filter control unit on the
69 * content filter kernel control socket via the socket option
70 * CFIL_OPT_NECP_CONTROL_UNIT.
71 *
72 * 3) The NECP database is consulted to find out if a given TCP/IP socket
73 * needs to be subjected to content filtering and returns the corresponding
74 * NECP filter control unit -- the NECP filter control unit is actually
75 * stored in the TCP/IP socket structure so the NECP lookup is really simple.
76 *
77 * 4) The NECP filter control unit is then used to find the corresponding
78 * kernel control socket instance.
79 *
80 * Note: NECP currently supports a single filter control unit per TCP/IP socket
81 * but this restriction may soon be lifted.
82 *
83 *
84 * THE MESSAGING PROTOCOL
85 *
86 * The socket content filter subsystem and a user space filter agent
87 * communicate over the kernel control socket via an asynchronous
88 * messaging protocol (this is not a request-response protocol).
89 * The socket content filter subsystem sends event messages to the user
90 * space filter agent about the TCP/IP sockets it is interested in filtering.
91 * The user space filter agent sends action messages to either allow
92 * data to pass or to disallow the data flow (and drop the connection).
93 *
94 * All messages over a content filter kernel control socket share the same
95 * common header of type "struct cfil_msg_hdr". The message type tells whether
96 * it's an event message "CFM_TYPE_EVENT" or an action message "CFM_TYPE_ACTION".
97 * The message header field "cfm_sock_id" identifies a given TCP/IP socket.
98 * Note: the message length given in the header may include padding for
99 * alignment and can be larger than the actual content of the message.
100 * The field "cfm_op" describes the kind of event or action. (An illustrative
    * user space agent sketch follows this comment block.)
101 *
102 * Here are the kinds of content filter events:
103 * - CFM_OP_SOCKET_ATTACHED: a new TCP/IP socket is being filtered
104 * - CFM_OP_SOCKET_CLOSED: A TCP/IP socket is closed
105 * - CFM_OP_DATA_OUT: A span of data is being sent on a TCP/IP socket
106 * - CFM_OP_DATA_IN: A span of data is being received on a TCP/IP socket
107 *
108 *
109 * EVENT MESSAGES
110 *
111 * The CFM_OP_DATA_OUT and CFM_OP_DATA_IN event messages contain a span of
112 * data that is being sent or received. The position of this span of data
113 * in the data flow is described by a set of start and end offsets. These
114 * are absolute 64-bit offsets. The first byte sent (or received) starts
115 * at offset 0 and ends at offset 1. The length of the content data
116 * is given by the difference between the end offset and the start offset.
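 *
 * As an illustration (byte counts chosen arbitrarily, not from a trace): the
 * first CFM_OP_DATA_OUT event for a flow that wrote 1460 bytes carries start
 * offset 0 and end offset 1460; an event for the next 1000 bytes carries
 * start offset 1460 and end offset 2460, for a length of 2460 - 1460 = 1000.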
117 *
118 * After a CFM_OP_SOCKET_ATTACHED is delivered, CFM_OP_DATA_OUT and
119 * CFM_OP_DATA_IN events are not delivered until a CFM_OP_DATA_UPDATE
120 * action message is sent by the user space filter agent.
121 *
122 * Note: absolute 64-bit offsets should be large enough for the foreseeable
123 * future. A 64-bit counter will wrap after 468 years at 10 Gbit/sec:
124 * 2^64 / ((10E9 / 8) * 60 * 60 * 24 * 365.25) = 467.63
125 *
126 * There are two kinds of content filter actions:
127 * - CFM_OP_DATA_UPDATE: to update pass or peek offsets for each direction.
128 * - CFM_OP_DROP: to shut down the socket and disallow further data flow
129 *
130 *
131 * ACTION MESSAGES
132 *
133 * The CFM_OP_DATA_UPDATE action messages let the user space filter
134 * agent allow data to flow up to the specified pass offset -- there
135 * is a pass offset for outgoing data and a pass offset for incoming data.
136 * When a new TCP/IP socket is attached to the content filter, each pass offset
137 * is initially set to 0 so no data is allowed to pass by default.
138 * When the pass offset is set to CFM_MAX_OFFSET via a CFM_OP_DATA_UPDATE
139 * then the data flow becomes unrestricted.
140 *
141 * Note that pass offsets can only be incremented. A CFM_OP_DATA_UPDATE message
142 * with a pass offset smaller than the pass offset of a previous
143 * CFM_OP_DATA_UPDATE message is silently ignored.
144 *
145 * A user space filter agent also uses CFM_OP_DATA_UPDATE action messages
146 * to tell the kernel how much data it wants to see by using the peek offsets.
147 * Just like pass offsets, there is a peek offset for each direction.
148 * When a new TCP/IP socket is attached to the content filter, each peek offset
149 * is initially set to 0 so no CFM_OP_DATA_OUT or CFM_OP_DATA_IN event
150 * messages are dispatched by default until a CFM_OP_DATA_UPDATE action message
151 * with a peek offset greater than 0 is sent by the user space filter agent.
152 * When the peek offset is set to CFM_MAX_OFFSET via a CFM_OP_DATA_UPDATE
153 * then the flow of data events becomes unrestricted.
154 *
155 * Note that peek offsets cannot be smaller than the corresponding pass offset.
156 * Also, a peek offset cannot be smaller than the corresponding end offset
157 * of the last CFM_OP_DATA_OUT/CFM_OP_DATA_IN message dispatched. Trying
158 * to set a peek value that is too small is silently ignored.
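 *
 * As an illustration (offset values chosen arbitrarily): with an outgoing
 * peek offset of 4096 and a pass offset of 0, the filter agent receives
 * CFM_OP_DATA_OUT events covering the first 4096 bytes while the kernel holds
 * those bytes; a CFM_OP_DATA_UPDATE raising the outgoing pass offset to 4096
 * releases them, and raising the peek offset further lets the agent see the
 * next span of data.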
159 *
160 *
161 * PER SOCKET "struct cfil_info"
162 *
163 * As soon as a TCP/IP socket gets attached to a content filter, a
164 * "struct cfil_info" is created to hold the content filtering state for this
165 * socket.
166 *
167 * The content filtering state is made of the following information
168 * for each direction:
169 * - The current pass offset;
170 * - The first and last offsets of the data pending, waiting for a filtering
171 * decision;
172 * - The inject queue for data that passed the filters and that needs
173 * to be re-injected;
174 * - A content filter specific state in a set of "struct cfil_entry"
175 *
176 *
177 * CONTENT FILTER STATE "struct cfil_entry"
178 *
179 * The "struct cfil_entry" maintains the information most relevant to the
180 * message handling over a kernel control socket with a user space filter agent.
181 *
182 * The "struct cfil_entry" holds the NECP filter control unit that corresponds
183 * to the kernel control socket unit it corresponds to and also has a pointer
184 * to the corresponding "struct content_filter".
185 *
186 * For each direction, "struct cfil_entry" maintains the following information:
187 * - The pass offset
188 * - The peek offset
189 * - The offset of the last data peeked at by the filter
190 * - A queue of data that's waiting to be delivered to the user space filter
191 * agent on the kernel control socket
192 * - A queue of data for which event messages have been sent on the kernel
193 * control socket and that are awaiting a filtering decision.
194 *
195 *
196 * CONTENT FILTER QUEUES
197 *
198 * Data that is being filtered is steered away from the TCP/IP socket buffer
199 * and instead sits in one of three content filter queues until the data
200 * can be re-injected into the TCP/IP socket buffer.
201 *
202 * A content filter queue is represented by "struct cfil_queue" that contains
203 * a list of mbufs and the start and end offset of the data span of
204 * the list of mbufs.
205 *
206 * The data moves into the three content filter queues according to this
207 * sequence:
208 * a) The "cfe_ctl_q" of "struct cfil_entry"
209 * b) The "cfe_pending_q" of "struct cfil_entry"
210 * c) The "cfi_inject_q" of "struct cfil_info"
211 *
212 * Note: The sequence (a),(b) may be repeated several times if more than one
213 * content filter is attached to the TCP/IP socket.
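 *
 * As an illustration with two attached content filters, a span of data moves
 * through the queues in this order:
 *   cfe_ctl_q (filter 1) -> cfe_pending_q (filter 1) ->
 *   cfe_ctl_q (filter 2) -> cfe_pending_q (filter 2) -> cfi_inject_q
 * before it is re-injected into the TCP/IP socket buffer.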
214 *
215 * The "cfe_ctl_q" queue holds data than cannot be delivered to the
216 * kernel conntrol socket for two reasons:
217 * - The peek offset is less that the end offset of the mbuf data
218 * - The kernel control socket is flow controlled
219 *
220 * The "cfe_pending_q" queue holds data for which CFM_OP_DATA_OUT or
221 * CFM_OP_DATA_IN have been successfully dispatched to the kernel control
222 * socket and are waiting for a pass action message fromn the user space
223 * filter agent. An mbuf length must be fully allowed to pass to be removed
224 * from the cfe_pending_q.
225 *
226 * The "cfi_inject_q" queue holds data that has been fully allowed to pass
227 * by the user space filter agent and that needs to be re-injected into the
228 * TCP/IP socket.
229 *
230 *
231 * IMPACT ON FLOW CONTROL
232 *
233 * An essential aspect of the content filter subsystem is to minimize the
234 * impact on flow control of the TCP/IP sockets being filtered.
235 *
236 * The processing overhead of the content filtering may have an effect on
237 * flow control by adding noticeable delays and cannot be eliminated --
238 * care must be taken by the user space filter agent to minimize the
239 * processing delays.
240 *
241 * The amount of data being filtered is kept in buffers while waiting for
242 * a decision by the user space filter agent. This amount of data pending
243 * needs to be subtracted from the amount of data available in the
244 * corresponding TCP/IP socket buffer. This is done by modifying
245 * sbspace() and tcp_sbspace() to account for the amount of data pending
246 * in the content filter.
247 *
248 *
249 * LOCKING STRATEGY
250 *
251 * The global state of content filter subsystem is protected by a single
252 * read-write lock "cfil_lck_rw". The data flow can be done with the
253 * cfil read-write lock held as shared so it can be re-entered from multiple
254 * threads.
255 *
256 * The per TCP/IP socket content filter state -- "struct cfil_info" -- is
257 * protected by the socket lock.
258 *
259 * A TCP/IP socket lock cannot be taken while the cfil read-write lock
260 * is held. That's why we have some sequences where we drop the cfil read-write
261 * lock before taking the TCP/IP socket lock.
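 *
 * One such sequence, as used in cfil_ctl_disconnect() below, looks like:
 *
 *	cfil_rw_unlock_exclusive(&cfil_lck_rw);
 *	socket_lock(so, 1);
 *	...
 *	cfil_rw_lock_exclusive(&cfil_lck_rw);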
262 *
263 * It is also important to lock the TCP/IP socket buffer while the content
264 * filter is modifying the amount of pending data. Otherwise the calculations
265 * in sbspace() and tcp_sbspace() could be wrong.
266 *
267 * The "cfil_lck_rw" protects "struct content_filter" and also the fields
268 * "cfe_link" and "cfe_filter" of "struct cfil_entry".
269 *
270 * Actually "cfe_link" and "cfe_filter" are protected by both by
271 * "cfil_lck_rw" and the socket lock: they may be modified only when
272 * "cfil_lck_rw" is exclusive and the socket is locked.
273 *
274 * To read the other fields of "struct content_filter" we have to take
275 * "cfil_lck_rw" in shared mode.
276 *
277 *
278 * LIMITATIONS
279 *
280 * - For TCP sockets only
281 *
282 * - Does not support TCP unordered messages
283 */
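
/*
 * ILLUSTRATIVE SKETCH (user space, not compiled as part of this file): a
 * minimal content filter agent that connects to the kernel control socket,
 * sets its NECP filter control unit, and lets all data pass for every flow
 * it is told about.  The control-unit value (42) and the kcunit (1) are
 * arbitrary assumptions, the message and option definitions are assumed to
 * come from <net/content_filter.h>, and error handling is omitted.
 */
#if 0
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/kern_control.h>
#include <sys/sys_domain.h>
#include <net/content_filter.h>
#include <string.h>
#include <unistd.h>

static void
example_cfil_agent(void)
{
	struct ctl_info info;
	struct sockaddr_ctl addr;
	uint32_t necp_unit = 42;	/* must match the NECP policy (assumption) */
	int fd;

	fd = socket(PF_SYSTEM, SOCK_DGRAM, SYSPROTO_CONTROL);

	/* Resolve the dynamic kernel control id from its name */
	memset(&info, 0, sizeof(info));
	strlcpy(info.ctl_name, CONTENT_FILTER_CONTROL_NAME, sizeof(info.ctl_name));
	ioctl(fd, CTLIOCGINFO, &info);

	memset(&addr, 0, sizeof(addr));
	addr.sc_len = sizeof(addr);
	addr.sc_family = AF_SYSTEM;
	addr.ss_sysaddr = AF_SYS_CONTROL;
	addr.sc_id = info.ctl_id;
	addr.sc_unit = 1;		/* kcunit, 1..MAX_CONTENT_FILTER (assumption) */
	connect(fd, (struct sockaddr *)&addr, sizeof(addr));

	/* Tell the subsystem which NECP filter control unit this agent serves */
	setsockopt(fd, SYSPROTO_CONTROL, CFIL_OPT_NECP_CONTROL_UNIT,
	    &necp_unit, sizeof(necp_unit));

	for (;;) {
		char buf[4096];
		struct cfil_msg_hdr *hdr = (struct cfil_msg_hdr *)buf;
		ssize_t n = recv(fd, buf, sizeof(buf), 0);

		if (n < (ssize_t)sizeof(*hdr))
			break;

		if (hdr->cfm_type == CFM_TYPE_EVENT &&
		    hdr->cfm_op == CFM_OP_SOCKET_ATTACHED) {
			/*
			 * Let everything pass and peek on this flow; the action
			 * message starts with a cfil_msg_hdr, so fill the header
			 * through a pointer cast to its first member.
			 */
			struct cfil_msg_action action;
			struct cfil_msg_hdr *ahdr = (struct cfil_msg_hdr *)&action;

			memset(&action, 0, sizeof(action));
			ahdr->cfm_len = sizeof(action);
			ahdr->cfm_version = CFM_VERSION_CURRENT;
			ahdr->cfm_type = CFM_TYPE_ACTION;
			ahdr->cfm_op = CFM_OP_DATA_UPDATE;
			ahdr->cfm_sock_id = hdr->cfm_sock_id;
			action.cfa_out_pass_offset = CFM_MAX_OFFSET;
			action.cfa_out_peek_offset = CFM_MAX_OFFSET;
			action.cfa_in_pass_offset = CFM_MAX_OFFSET;
			action.cfa_in_peek_offset = CFM_MAX_OFFSET;
			send(fd, &action, sizeof(action), 0);
		}
	}
	close(fd);
}
#endif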
284
285 /*
286 * TO DO LIST
287 *
288 * SOONER:
289 *
290 * Deal with OOB
291 *
292 * LATER:
293 *
294 * If datagram support is added, enqueue control and address mbufs as well
295 */
296
297 #include <sys/types.h>
298 #include <sys/kern_control.h>
299 #include <sys/queue.h>
300 #include <sys/domain.h>
301 #include <sys/protosw.h>
302 #include <sys/syslog.h>
303
304 #include <kern/locks.h>
305 #include <kern/zalloc.h>
306 #include <kern/debug.h>
307
308 #include <net/content_filter.h>
309
310 #include <netinet/in_pcb.h>
311 #include <netinet/tcp.h>
312 #include <netinet/tcp_var.h>
313
314 #include <string.h>
315 #include <libkern/libkern.h>
316
317
318 #define MAX_CONTENT_FILTER 2
319
320 struct cfil_entry;
321
322 /*
323 * The structure content_filter represents a user space content filter.
324 * It's created and associated with a kernel control socket instance.
325 */
326 struct content_filter {
327 kern_ctl_ref cf_kcref;
328 u_int32_t cf_kcunit;
329 u_int32_t cf_flags;
330
331 uint32_t cf_necp_control_unit;
332
333 uint32_t cf_sock_count;
334 TAILQ_HEAD(, cfil_entry) cf_sock_entries;
335 };
336
337 #define CFF_ACTIVE 0x01
338 #define CFF_DETACHING 0x02
339 #define CFF_FLOW_CONTROLLED 0x04
340
341 struct content_filter **content_filters = NULL;
342 uint32_t cfil_active_count = 0; /* Number of active content filters */
343 uint32_t cfil_sock_attached_count = 0; /* Number of socket attachments */
344 uint32_t cfil_close_wait_timeout = 1000; /* in milliseconds */
345
346 static kern_ctl_ref cfil_kctlref = NULL;
347
348 static lck_grp_attr_t *cfil_lck_grp_attr = NULL;
349 static lck_attr_t *cfil_lck_attr = NULL;
350 static lck_grp_t *cfil_lck_grp = NULL;
351 decl_lck_rw_data(static, cfil_lck_rw);
352
353 #define CFIL_RW_LCK_MAX 8
354
355 int cfil_rw_nxt_lck = 0;
356 void* cfil_rw_lock_history[CFIL_RW_LCK_MAX];
357
358 int cfil_rw_nxt_unlck = 0;
359 void* cfil_rw_unlock_history[CFIL_RW_LCK_MAX];
360
361 #define CONTENT_FILTER_ZONE_NAME "content_filter"
362 #define CONTENT_FILTER_ZONE_MAX 10
363 static struct zone *content_filter_zone = NULL; /* zone for content_filter */
364
365
366 #define CFIL_INFO_ZONE_NAME "cfil_info"
367 #define CFIL_INFO_ZONE_MAX 1024
368 static struct zone *cfil_info_zone = NULL; /* zone for cfil_info */
369
370 MBUFQ_HEAD(cfil_mqhead);
371
372 struct cfil_queue {
373 uint64_t q_start; /* offset of first byte in queue */
374 uint64_t q_end; /* offset of last byte in queue */
375 struct cfil_mqhead q_mq;
376 };
377
378 /*
379 * struct cfil_entry
380 *
381 * There is one entry per content filter
382 */
383 struct cfil_entry {
384 TAILQ_ENTRY(cfil_entry) cfe_link;
385 struct content_filter *cfe_filter;
386
387 struct cfil_info *cfe_cfil_info;
388 uint32_t cfe_flags;
389 uint32_t cfe_necp_control_unit;
390 struct timeval cfe_last_event; /* To user space */
391 struct timeval cfe_last_action; /* From user space */
392
393 struct cfe_buf {
394 /*
395 * cfe_pending_q holds data that has been delivered to
396 * the filter and for which we are waiting for an action
397 */
398 struct cfil_queue cfe_pending_q;
399 /*
400 * This queue is for data that has not been delivered to
401 * the content filter (new data, pass peek or flow control)
402 */
403 struct cfil_queue cfe_ctl_q;
404
405 uint64_t cfe_pass_offset;
406 uint64_t cfe_peek_offset;
407 uint64_t cfe_peeked;
408 } cfe_snd, cfe_rcv;
409 };
410
411 #define CFEF_CFIL_ATTACHED 0x0001 /* was attached to filter */
412 #define CFEF_SENT_SOCK_ATTACHED 0x0002 /* sock attach event was sent */
413 #define CFEF_DATA_START 0x0004 /* can send data event */
414 #define CFEF_FLOW_CONTROLLED 0x0008 /* wait for flow control lift */
415 #define CFEF_SENT_DISCONNECT_IN 0x0010 /* event was sent */
416 #define CFEF_SENT_DISCONNECT_OUT 0x0020 /* event was sent */
417 #define CFEF_SENT_SOCK_CLOSED 0x0040 /* closed event was sent */
418 #define CFEF_CFIL_DETACHED 0x0080 /* filter was detached */
419
420 /*
421 * struct cfil_info
422 *
423 * There is a struct cfil_info per socket
424 */
425 struct cfil_info {
426 TAILQ_ENTRY(cfil_info) cfi_link;
427 struct socket *cfi_so;
428 uint64_t cfi_flags;
429 uint64_t cfi_sock_id;
430
431 struct cfi_buf {
432 /*
433 * cfi_pending_first and cfi_pending_last describe the total
434 * amount of data outstanding for all the filters on
435 * this socket and data in the flow queue
436 * cfi_pending_mbcnt counts in sballoc() "chars of mbufs used"
437 */
438 uint64_t cfi_pending_first;
439 uint64_t cfi_pending_last;
440 int cfi_pending_mbcnt;
441 /*
442 * cfi_pass_offset is the minimum of all the filters
443 */
444 uint64_t cfi_pass_offset;
445 /*
446 * cfi_inject_q holds data that needs to be re-injected
447 * into the socket after filtering and that can
448 * be queued because of flow control
449 */
450 struct cfil_queue cfi_inject_q;
451 } cfi_snd, cfi_rcv;
452
453 struct cfil_entry cfi_entries[MAX_CONTENT_FILTER];
454 };
455
456 #define CFIF_DROP 0x0001 /* drop action applied */
457 #define CFIF_CLOSE_WAIT 0x0002 /* waiting for filter to close */
458 #define CFIF_SOCK_CLOSED 0x0004 /* socket is closed */
459 #define CFIF_RETRY_INJECT_IN 0x0010 /* inject in failed */
460 #define CFIF_RETRY_INJECT_OUT 0x0020 /* inject out failed */
461 #define CFIF_SHUT_WR 0x0040 /* shutdown write */
462 #define CFIF_SHUT_RD 0x0080 /* shutdown read */
463
464 #define CFI_MASK_GENCNT 0xFFFFFFFF00000000 /* upper 32 bits */
465 #define CFI_SHIFT_GENCNT 32
466 #define CFI_MASK_FLOWHASH 0x00000000FFFFFFFF /* lower 32 bits */
467 #define CFI_SHIFT_FLOWHASH 0
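
/*
 * Illustrative composition / decomposition of a cfi_sock_id (this mirrors
 * cfil_info_alloc() and cfil_socket_from_sock_id() below):
 *
 *	cfi_sock_id = ((so->so_gencnt << CFI_SHIFT_GENCNT) | inp->inp_flowhash);
 *	gencnt      = (cfi_sock_id & CFI_MASK_GENCNT) >> CFI_SHIFT_GENCNT;
 *	flowhash    = (cfi_sock_id & CFI_MASK_FLOWHASH) >> CFI_SHIFT_FLOWHASH;
 */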
468
469 TAILQ_HEAD(cfil_sock_head, cfil_info) cfil_sock_head;
470
471 #define CFIL_QUEUE_VERIFY(x) if (cfil_debug) cfil_queue_verify(x)
472 #define CFIL_INFO_VERIFY(x) if (cfil_debug) cfil_info_verify(x)
473
474 /*
475 * Statistics
476 */
477
478 struct cfil_stats cfil_stats;
479
480 /*
481 * For troubleshooting
482 */
483 int cfil_log_level = LOG_ERR;
484 int cfil_debug = 1;
485
486 /*
487 * Sysctls for logs and statistics
488 */
489 static int sysctl_cfil_filter_list(struct sysctl_oid *, void *, int,
490 struct sysctl_req *);
491 static int sysctl_cfil_sock_list(struct sysctl_oid *, void *, int,
492 struct sysctl_req *);
493
494 SYSCTL_NODE(_net, OID_AUTO, cfil, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "cfil");
495
496 SYSCTL_INT(_net_cfil, OID_AUTO, log, CTLFLAG_RW|CTLFLAG_LOCKED,
497 &cfil_log_level, 0, "");
498
499 SYSCTL_INT(_net_cfil, OID_AUTO, debug, CTLFLAG_RW|CTLFLAG_LOCKED,
500 &cfil_debug, 0, "");
501
502 SYSCTL_UINT(_net_cfil, OID_AUTO, sock_attached_count, CTLFLAG_RD|CTLFLAG_LOCKED,
503 &cfil_sock_attached_count, 0, "");
504
505 SYSCTL_UINT(_net_cfil, OID_AUTO, active_count, CTLFLAG_RD|CTLFLAG_LOCKED,
506 &cfil_active_count, 0, "");
507
508 SYSCTL_UINT(_net_cfil, OID_AUTO, close_wait_timeout, CTLFLAG_RW|CTLFLAG_LOCKED,
509 &cfil_close_wait_timeout, 0, "");
510
511 static int cfil_sbtrim = 1;
512 SYSCTL_UINT(_net_cfil, OID_AUTO, sbtrim, CTLFLAG_RW|CTLFLAG_LOCKED,
513 &cfil_sbtrim, 0, "");
514
515 SYSCTL_PROC(_net_cfil, OID_AUTO, filter_list, CTLFLAG_RD|CTLFLAG_LOCKED,
516 0, 0, sysctl_cfil_filter_list, "S,cfil_filter_stat", "");
517
518 SYSCTL_PROC(_net_cfil, OID_AUTO, sock_list, CTLFLAG_RD|CTLFLAG_LOCKED,
519 0, 0, sysctl_cfil_sock_list, "S,cfil_sock_stat", "");
520
521 SYSCTL_STRUCT(_net_cfil, OID_AUTO, stats, CTLFLAG_RD|CTLFLAG_LOCKED,
522 &cfil_stats, cfil_stats, "");
523
524 /*
525 * Forward declaration to appease the compiler
526 */
527 static int cfil_action_data_pass(struct socket *, uint32_t, int,
528 uint64_t, uint64_t);
529 static int cfil_action_drop(struct socket *, uint32_t);
530 static int cfil_dispatch_closed_event(struct socket *, int);
531 static int cfil_data_common(struct socket *, int, struct sockaddr *,
532 struct mbuf *, struct mbuf *, uint32_t);
533 static int cfil_data_filter(struct socket *, uint32_t, int,
534 struct mbuf *, uint64_t);
535 static void fill_ip_sockaddr_4_6(union sockaddr_in_4_6 *,
536 struct in_addr, u_int16_t);
537 static void fill_ip6_sockaddr_4_6(union sockaddr_in_4_6 *,
538 struct in6_addr *, u_int16_t);
539 static int cfil_dispatch_attach_event(struct socket *, uint32_t);
540 static void cfil_info_free(struct socket *, struct cfil_info *);
541 static struct cfil_info * cfil_info_alloc(struct socket *);
542 static int cfil_info_attach_unit(struct socket *, uint32_t);
543 static struct socket * cfil_socket_from_sock_id(cfil_sock_id_t);
544 static int cfil_service_pending_queue(struct socket *, uint32_t, int);
545 static int cfil_data_service_ctl_q(struct socket *, uint32_t, int);
546 static void cfil_info_verify(struct cfil_info *);
547 static int cfil_update_data_offsets(struct socket *, uint32_t, int,
548 uint64_t, uint64_t);
549 static int cfil_acquire_sockbuf(struct socket *, int);
550 static void cfil_release_sockbuf(struct socket *, int);
551 static int cfil_filters_attached(struct socket *);
552
553 static void cfil_rw_lock_exclusive(lck_rw_t *);
554 static void cfil_rw_unlock_exclusive(lck_rw_t *);
555 static void cfil_rw_lock_shared(lck_rw_t *);
556 static void cfil_rw_unlock_shared(lck_rw_t *);
557 static boolean_t cfil_rw_lock_shared_to_exclusive(lck_rw_t *);
558 static void cfil_rw_lock_exclusive_to_shared(lck_rw_t *);
559
560 static unsigned int cfil_data_length(struct mbuf *, int *);
561
562 /*
563 * Content filter global read write lock
564 */
565
566 static void
567 cfil_rw_lock_exclusive(lck_rw_t *lck)
568 {
569 void *lr_saved;
570
571 lr_saved = __builtin_return_address(0);
572
573 lck_rw_lock_exclusive(lck);
574
575 cfil_rw_lock_history[cfil_rw_nxt_lck] = lr_saved;
576 cfil_rw_nxt_lck = (cfil_rw_nxt_lck + 1) % CFIL_RW_LCK_MAX;
577 }
578
579 static void
580 cfil_rw_unlock_exclusive(lck_rw_t *lck)
581 {
582 void *lr_saved;
583
584 lr_saved = __builtin_return_address(0);
585
586 lck_rw_unlock_exclusive(lck);
587
588 cfil_rw_unlock_history[cfil_rw_nxt_unlck] = lr_saved;
589 cfil_rw_nxt_unlck = (cfil_rw_nxt_unlck + 1) % CFIL_RW_LCK_MAX;
590 }
591
592 static void
593 cfil_rw_lock_shared(lck_rw_t *lck)
594 {
595 void *lr_saved;
596
597 lr_saved = __builtin_return_address(0);
598
599 lck_rw_lock_shared(lck);
600
601 cfil_rw_lock_history[cfil_rw_nxt_lck] = lr_saved;
602 cfil_rw_nxt_lck = (cfil_rw_nxt_lck + 1) % CFIL_RW_LCK_MAX;
603 }
604
605 static void
606 cfil_rw_unlock_shared(lck_rw_t *lck)
607 {
608 void *lr_saved;
609
610 lr_saved = __builtin_return_address(0);
611
612 lck_rw_unlock_shared(lck);
613
614 cfil_rw_unlock_history[cfil_rw_nxt_unlck] = lr_saved;
615 cfil_rw_nxt_unlck = (cfil_rw_nxt_unlck + 1) % CFIL_RW_LCK_MAX;
616 }
617
618 static boolean_t
619 cfil_rw_lock_shared_to_exclusive(lck_rw_t *lck)
620 {
621 void *lr_saved;
622 boolean_t upgraded;
623
624 lr_saved = __builtin_return_address(0);
625
626 upgraded = lck_rw_lock_shared_to_exclusive(lck);
627 if (upgraded) {
628 cfil_rw_unlock_history[cfil_rw_nxt_unlck] = lr_saved;
629 cfil_rw_nxt_unlck = (cfil_rw_nxt_unlck + 1) % CFIL_RW_LCK_MAX;
630 }
631 return (upgraded);
632 }
633
634 static void
635 cfil_rw_lock_exclusive_to_shared(lck_rw_t *lck)
636 {
637 void *lr_saved;
638
639 lr_saved = __builtin_return_address(0);
640
641 lck_rw_lock_exclusive_to_shared(lck);
642
643 cfil_rw_lock_history[cfil_rw_nxt_lck] = lr_saved;
644 cfil_rw_nxt_lck = (cfil_rw_nxt_lck + 1) % CFIL_RW_LCK_MAX;
645 }
646
647 static void
648 cfil_rw_lock_assert_held(lck_rw_t *lck, int exclusive)
649 {
650 lck_rw_assert(lck,
651 exclusive ? LCK_RW_ASSERT_EXCLUSIVE : LCK_RW_ASSERT_HELD);
652 }
653
654 static void
655 socket_lock_assert_owned(struct socket *so)
656 {
657 lck_mtx_t *mutex_held;
658
659 if (so->so_proto->pr_getlock != NULL)
660 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
661 else
662 mutex_held = so->so_proto->pr_domain->dom_mtx;
663
664 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
665 }
666
667 /*
668 * Return the number of bytes in the mbuf chain using the same
669 * method as m_length() or sballoc()
670 */
671 static unsigned int
672 cfil_data_length(struct mbuf *m, int *retmbcnt)
673 {
674 struct mbuf *m0;
675 unsigned int pktlen;
676 int mbcnt;
677
678 if (retmbcnt == NULL)
679 return (m_length(m));
680
681 pktlen = 0;
682 mbcnt = 0;
683 for (m0 = m; m0 != NULL; m0 = m0->m_next) {
684 pktlen += m0->m_len;
685 mbcnt += MSIZE;
686 if (m0->m_flags & M_EXT)
687 mbcnt += m0->m_ext.ext_size;
688 }
689 *retmbcnt = mbcnt;
690 return (pktlen);
691 }
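
/*
 * Illustrative use (hypothetical caller; "data", "mbcnt" and "datalen" are
 * made-up names): fetch both the byte count and the mbuf storage count of a
 * chain in a single pass.
 *
 *	int mbcnt;
 *	unsigned int datalen = cfil_data_length(data, &mbcnt);
 */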
692
693 /*
694 * Common mbuf queue utilities
695 */
696
697 static inline void
698 cfil_queue_init(struct cfil_queue *cfq)
699 {
700 cfq->q_start = 0;
701 cfq->q_end = 0;
702 MBUFQ_INIT(&cfq->q_mq);
703 }
704
705 static inline uint64_t
706 cfil_queue_drain(struct cfil_queue *cfq)
707 {
708 	uint64_t drained = cfq->q_end - cfq->q_start;
709 cfq->q_start = 0;
710 cfq->q_end = 0;
711 MBUFQ_DRAIN(&cfq->q_mq);
712
713 return (drained);
714 }
715
716 /* Return 1 when empty, 0 otherwise */
717 static inline int
718 cfil_queue_empty(struct cfil_queue *cfq)
719 {
720 return (MBUFQ_EMPTY(&cfq->q_mq));
721 }
722
723 static inline uint64_t
724 cfil_queue_offset_first(struct cfil_queue *cfq)
725 {
726 return (cfq->q_start);
727 }
728
729 static inline uint64_t
730 cfil_queue_offset_last(struct cfil_queue *cfq)
731 {
732 return (cfq->q_end);
733 }
734
735 static inline uint64_t
736 cfil_queue_len(struct cfil_queue *cfq)
737 {
738 return (cfq->q_end - cfq->q_start);
739 }
740
741 /*
742 * Routines to verify some fundamental assumptions
743 */
744
745 static void
746 cfil_queue_verify(struct cfil_queue *cfq)
747 {
748 mbuf_t m;
749 mbuf_t n;
750 uint64_t queuesize = 0;
751
752 	/* Verify offsets are ordered */
753 VERIFY(cfq->q_start <= cfq->q_end);
754
755 /*
756 * When queue is empty, the offsets are equal otherwise the offsets
757 * are different
758 */
759 VERIFY((MBUFQ_EMPTY(&cfq->q_mq) && cfq->q_start == cfq->q_end) ||
760 (!MBUFQ_EMPTY(&cfq->q_mq) &&
761 cfq->q_start != cfq->q_end));
762
763 MBUFQ_FOREACH(m, &cfq->q_mq) {
764 size_t chainsize = 0;
765 unsigned int mlen = m_length(m);
766
767 if (m == (void *)M_TAG_FREE_PATTERN ||
768 m->m_next == (void *)M_TAG_FREE_PATTERN ||
769 m->m_nextpkt == (void *)M_TAG_FREE_PATTERN)
770 panic("%s - mq %p is free at %p", __func__,
771 &cfq->q_mq, m);
772 for (n = m; n != NULL; n = n->m_next) {
773 if (n->m_type != MT_DATA &&
774 n->m_type != MT_HEADER &&
775 n->m_type != MT_OOBDATA)
776 panic("%s - %p unsupported type %u", __func__,
777 n, n->m_type);
778 chainsize += n->m_len;
779 }
780 if (mlen != chainsize)
781 panic("%s - %p m_length() %u != chainsize %lu",
782 __func__, m, mlen, chainsize);
783 queuesize += chainsize;
784 }
785 if (queuesize != cfq->q_end - cfq->q_start)
786 panic("%s - %p queuesize %llu != offsetdiffs %llu", __func__,
787 m, queuesize, cfq->q_end - cfq->q_start);
788 }
789
790 static void
791 cfil_queue_enqueue(struct cfil_queue *cfq, mbuf_t m, size_t len)
792 {
793 CFIL_QUEUE_VERIFY(cfq);
794
795 MBUFQ_ENQUEUE(&cfq->q_mq, m);
796 cfq->q_end += len;
797
798 CFIL_QUEUE_VERIFY(cfq);
799 }
800
801 static void
802 cfil_queue_remove(struct cfil_queue *cfq, mbuf_t m, size_t len)
803 {
804 CFIL_QUEUE_VERIFY(cfq);
805
806 VERIFY(m_length(m) == len);
807
808 MBUFQ_REMOVE(&cfq->q_mq, m);
809 MBUFQ_NEXT(m) = NULL;
810 cfq->q_start += len;
811
812 CFIL_QUEUE_VERIFY(cfq);
813 }
814
815 static mbuf_t
816 cfil_queue_first(struct cfil_queue *cfq)
817 {
818 return (MBUFQ_FIRST(&cfq->q_mq));
819 }
820
821 static mbuf_t
822 cfil_queue_next(struct cfil_queue *cfq, mbuf_t m)
823 {
824 #pragma unused(cfq)
825 return (MBUFQ_NEXT(m));
826 }
827
828 static void
829 cfil_entry_buf_verify(struct cfe_buf *cfe_buf)
830 {
831 CFIL_QUEUE_VERIFY(&cfe_buf->cfe_ctl_q);
832 CFIL_QUEUE_VERIFY(&cfe_buf->cfe_pending_q);
833
834 /* Verify the queues are ordered so that pending is before ctl */
835 VERIFY(cfe_buf->cfe_ctl_q.q_start >= cfe_buf->cfe_pending_q.q_end);
836
837 /* The peek offset cannot be less than the pass offset */
838 VERIFY(cfe_buf->cfe_peek_offset >= cfe_buf->cfe_pass_offset);
839
840 /* Make sure we've updated the offset we peeked at */
841 VERIFY(cfe_buf->cfe_ctl_q.q_start <= cfe_buf->cfe_peeked);
842 }
843
844 static void
845 cfil_entry_verify(struct cfil_entry *entry)
846 {
847 cfil_entry_buf_verify(&entry->cfe_snd);
848 cfil_entry_buf_verify(&entry->cfe_rcv);
849 }
850
851 static void
852 cfil_info_buf_verify(struct cfi_buf *cfi_buf)
853 {
854 CFIL_QUEUE_VERIFY(&cfi_buf->cfi_inject_q);
855
856 VERIFY(cfi_buf->cfi_pending_first <= cfi_buf->cfi_pending_last);
857 VERIFY(cfi_buf->cfi_pending_mbcnt >= 0);
858 }
859
860 static void
861 cfil_info_verify(struct cfil_info *cfil_info)
862 {
863 int i;
864
865 if (cfil_info == NULL)
866 return;
867
868 cfil_info_buf_verify(&cfil_info->cfi_snd);
869 cfil_info_buf_verify(&cfil_info->cfi_rcv);
870
871 for (i = 0; i < MAX_CONTENT_FILTER; i++)
872 cfil_entry_verify(&cfil_info->cfi_entries[i]);
873 }
874
875 static void
876 verify_content_filter(struct content_filter *cfc)
877 {
878 struct cfil_entry *entry;
879 uint32_t count = 0;
880
881 VERIFY(cfc->cf_sock_count >= 0);
882
883 TAILQ_FOREACH(entry, &cfc->cf_sock_entries, cfe_link) {
884 count++;
885 VERIFY(cfc == entry->cfe_filter);
886 }
887 VERIFY(count == cfc->cf_sock_count);
888 }
889
890 /*
891 * Kernel control socket callbacks
892 */
893 static errno_t
894 cfil_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
895 void **unitinfo)
896 {
897 errno_t error = 0;
898 struct content_filter *cfc = NULL;
899
900 CFIL_LOG(LOG_NOTICE, "");
901
902 cfc = zalloc(content_filter_zone);
903 if (cfc == NULL) {
904 CFIL_LOG(LOG_ERR, "zalloc failed");
905 error = ENOMEM;
906 goto done;
907 }
908 bzero(cfc, sizeof(struct content_filter));
909
910 cfil_rw_lock_exclusive(&cfil_lck_rw);
911 if (content_filters == NULL) {
912 struct content_filter **tmp;
913
914 cfil_rw_unlock_exclusive(&cfil_lck_rw);
915
916 MALLOC(tmp,
917 struct content_filter **,
918 MAX_CONTENT_FILTER * sizeof(struct content_filter *),
919 M_TEMP,
920 M_WAITOK | M_ZERO);
921
922 cfil_rw_lock_exclusive(&cfil_lck_rw);
923
924 if (tmp == NULL && content_filters == NULL) {
925 error = ENOMEM;
926 cfil_rw_unlock_exclusive(&cfil_lck_rw);
927 goto done;
928 }
929 /* Another thread may have won the race */
930 if (content_filters != NULL)
931 FREE(tmp, M_TEMP);
932 else
933 content_filters = tmp;
934 }
935
936 if (sac->sc_unit == 0 || sac->sc_unit > MAX_CONTENT_FILTER) {
937 CFIL_LOG(LOG_ERR, "bad sc_unit %u", sac->sc_unit);
938 error = EINVAL;
939 } else if (content_filters[sac->sc_unit - 1] != NULL) {
940 CFIL_LOG(LOG_ERR, "sc_unit %u in use", sac->sc_unit);
941 error = EADDRINUSE;
942 } else {
943 /*
944 * kernel control socket kcunit numbers start at 1
945 */
946 content_filters[sac->sc_unit - 1] = cfc;
947
948 cfc->cf_kcref = kctlref;
949 cfc->cf_kcunit = sac->sc_unit;
950 TAILQ_INIT(&cfc->cf_sock_entries);
951
952 *unitinfo = cfc;
953 cfil_active_count++;
954 }
955 cfil_rw_unlock_exclusive(&cfil_lck_rw);
956 done:
957 if (error != 0 && cfc != NULL)
958 zfree(content_filter_zone, cfc);
959
960 if (error == 0)
961 OSIncrementAtomic(&cfil_stats.cfs_ctl_connect_ok);
962 else
963 OSIncrementAtomic(&cfil_stats.cfs_ctl_connect_fail);
964
965 CFIL_LOG(LOG_INFO, "return %d cfil_active_count %u kcunit %u",
966 error, cfil_active_count, sac->sc_unit);
967
968 return (error);
969 }
970
971 static errno_t
972 cfil_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo)
973 {
974 #pragma unused(kctlref)
975 errno_t error = 0;
976 struct content_filter *cfc;
977 struct cfil_entry *entry;
978
979 CFIL_LOG(LOG_NOTICE, "");
980
981 if (content_filters == NULL) {
982 CFIL_LOG(LOG_ERR, "no content filter");
983 error = EINVAL;
984 goto done;
985 }
986 if (kcunit > MAX_CONTENT_FILTER) {
987 CFIL_LOG(LOG_ERR, "kcunit %u > MAX_CONTENT_FILTER (%d)",
988 kcunit, MAX_CONTENT_FILTER);
989 error = EINVAL;
990 goto done;
991 }
992
993 cfc = (struct content_filter *)unitinfo;
994 if (cfc == NULL)
995 goto done;
996
997 cfil_rw_lock_exclusive(&cfil_lck_rw);
998 if (content_filters[kcunit - 1] != cfc || cfc->cf_kcunit != kcunit) {
999 CFIL_LOG(LOG_ERR, "bad unit info %u)",
1000 kcunit);
1001 cfil_rw_unlock_exclusive(&cfil_lck_rw);
1002 goto done;
1003 }
1004 cfc->cf_flags |= CFF_DETACHING;
1005 /*
1006 * Remove all sockets from the filter
1007 */
1008 while ((entry = TAILQ_FIRST(&cfc->cf_sock_entries)) != NULL) {
1009 cfil_rw_lock_assert_held(&cfil_lck_rw, 1);
1010
1011 verify_content_filter(cfc);
1012 /*
1013 * Accept all outstanding data by pushing to next filter
1014 * or back to socket
1015 *
1016 * TBD: Actually we should make sure all data has been pushed
1017 * back to socket
1018 */
1019 if (entry->cfe_cfil_info && entry->cfe_cfil_info->cfi_so) {
1020 struct cfil_info *cfil_info = entry->cfe_cfil_info;
1021 struct socket *so = cfil_info->cfi_so;
1022
1023 /* Need to let data flow immediately */
1024 entry->cfe_flags |= CFEF_SENT_SOCK_ATTACHED |
1025 CFEF_DATA_START;
1026
1027 /*
1028 * Respect locking hierarchy
1029 */
1030 cfil_rw_unlock_exclusive(&cfil_lck_rw);
1031
1032 socket_lock(so, 1);
1033
1034 /*
1035 * When cfe_filter is NULL the filter is detached
1036 * and the entry has been removed from cf_sock_entries
1037 */
1038 if (so->so_cfil == NULL || entry->cfe_filter == NULL) {
1039 cfil_rw_lock_exclusive(&cfil_lck_rw);
1040 goto release;
1041 }
1042 (void) cfil_action_data_pass(so, kcunit, 1,
1043 CFM_MAX_OFFSET,
1044 CFM_MAX_OFFSET);
1045
1046 (void) cfil_action_data_pass(so, kcunit, 0,
1047 CFM_MAX_OFFSET,
1048 CFM_MAX_OFFSET);
1049
1050 cfil_rw_lock_exclusive(&cfil_lck_rw);
1051
1052 /*
1053 * Check again as the socket may have been unlocked
1054 		 * when calling cfil_acquire_sockbuf()
1055 */
1056 if (so->so_cfil == NULL || entry->cfe_filter == NULL)
1057 goto release;
1058
1059 /* The filter is now detached */
1060 entry->cfe_flags |= CFEF_CFIL_DETACHED;
1061 CFIL_LOG(LOG_NOTICE, "so %llx detached %u",
1062 (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit);
1063
1064 if ((so->so_cfil->cfi_flags & CFIF_CLOSE_WAIT) &&
1065 cfil_filters_attached(so) == 0) {
1066 CFIL_LOG(LOG_NOTICE, "so %llx waking",
1067 (uint64_t)VM_KERNEL_ADDRPERM(so));
1068 wakeup((caddr_t)&so->so_cfil);
1069 }
1070
1071 /*
1072 * Remove the filter entry from the content filter
1073 * but leave the rest of the state intact as the queues
1074 * may not be empty yet
1075 */
1076 entry->cfe_filter = NULL;
1077 entry->cfe_necp_control_unit = 0;
1078
1079 TAILQ_REMOVE(&cfc->cf_sock_entries, entry, cfe_link);
1080 cfc->cf_sock_count--;
1081 release:
1082 socket_unlock(so, 1);
1083 }
1084 }
1085 verify_content_filter(cfc);
1086
1087 VERIFY(cfc->cf_sock_count == 0);
1088
1089 /*
1090 * Make filter inactive
1091 */
1092 content_filters[kcunit - 1] = NULL;
1093 cfil_active_count--;
1094 cfil_rw_unlock_exclusive(&cfil_lck_rw);
1095
1096 zfree(content_filter_zone, cfc);
1097 done:
1098 if (error == 0)
1099 OSIncrementAtomic(&cfil_stats.cfs_ctl_disconnect_ok);
1100 else
1101 OSIncrementAtomic(&cfil_stats.cfs_ctl_disconnect_fail);
1102
1103 CFIL_LOG(LOG_INFO, "return %d cfil_active_count %u kcunit %u",
1104 error, cfil_active_count, kcunit);
1105
1106 return (error);
1107 }
1108
1109 /*
1110 * cfil_acquire_sockbuf()
1111 *
1112 * Prevent any other thread from acquiring the sockbuf
1113 * We use sb_cfil_thread as a semaphore to prevent other threads from
1114 * messing with the sockbuf -- see sblock()
1115 * Note: We do not set SB_LOCK here because the thread may check or modify
1116 * SB_LOCK several times until it calls cfil_release_sockbuf() -- currently
1117 * sblock(), sbunlock() or sodefunct()
1118 */
1119 static int
1120 cfil_acquire_sockbuf(struct socket *so, int outgoing)
1121 {
1122 thread_t tp = current_thread();
1123 struct sockbuf *sb = outgoing ? &so->so_snd : &so->so_rcv;
1124 lck_mtx_t *mutex_held;
1125 int error = 0;
1126
1127 /*
1128 * Wait until no thread is holding the sockbuf and other content
1129 * filter threads have released the sockbuf
1130 */
1131 while ((sb->sb_flags & SB_LOCK) ||
1132 (sb->sb_cfil_thread != NULL && sb->sb_cfil_thread != tp)) {
1133 if (so->so_proto->pr_getlock != NULL)
1134 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1135 else
1136 mutex_held = so->so_proto->pr_domain->dom_mtx;
1137
1138 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1139
1140 sb->sb_wantlock++;
1141 VERIFY(sb->sb_wantlock != 0);
1142
1143 msleep(&sb->sb_flags, mutex_held, PSOCK, "cfil_acquire_sockbuf",
1144 NULL);
1145
1146 VERIFY(sb->sb_wantlock != 0);
1147 sb->sb_wantlock--;
1148 }
1149 /*
1150 * Use reference count for repetitive calls on same thread
1151 */
1152 if (sb->sb_cfil_refs == 0) {
1153 VERIFY(sb->sb_cfil_thread == NULL);
1154 VERIFY((sb->sb_flags & SB_LOCK) == 0);
1155
1156 sb->sb_cfil_thread = tp;
1157 sb->sb_flags |= SB_LOCK;
1158 }
1159 sb->sb_cfil_refs++;
1160
1161 /* We acquire the socket buffer when we need to cleanup */
1162 if (so->so_cfil == NULL) {
1163 CFIL_LOG(LOG_ERR, "so %llx cfil detached",
1164 (uint64_t)VM_KERNEL_ADDRPERM(so));
1165 error = 0;
1166 } else if (so->so_cfil->cfi_flags & CFIF_DROP) {
1167 CFIL_LOG(LOG_ERR, "so %llx drop set",
1168 (uint64_t)VM_KERNEL_ADDRPERM(so));
1169 error = EPIPE;
1170 }
1171
1172 return (error);
1173 }
1174
1175 static void
1176 cfil_release_sockbuf(struct socket *so, int outgoing)
1177 {
1178 struct sockbuf *sb = outgoing ? &so->so_snd : &so->so_rcv;
1179 thread_t tp = current_thread();
1180
1181 socket_lock_assert_owned(so);
1182
1183 if (sb->sb_cfil_thread != NULL && sb->sb_cfil_thread != tp)
1184 panic("%s sb_cfil_thread %p not current %p", __func__,
1185 sb->sb_cfil_thread, tp);
1186 /*
1187 * Don't panic if we are defunct because SB_LOCK has
1188 * been cleared by sodefunct()
1189 */
1190 if (!(so->so_flags & SOF_DEFUNCT) && !(sb->sb_flags & SB_LOCK))
1191 panic("%s SB_LOCK not set on %p", __func__,
1192 sb);
1193 /*
1194 * We can unlock when the thread unwinds to the last reference
1195 */
1196 sb->sb_cfil_refs--;
1197 if (sb->sb_cfil_refs == 0) {
1198 sb->sb_cfil_thread = NULL;
1199 sb->sb_flags &= ~SB_LOCK;
1200
1201 if (sb->sb_wantlock > 0)
1202 wakeup(&sb->sb_flags);
1203 }
1204 }
1205
1206 cfil_sock_id_t
1207 cfil_sock_id_from_socket(struct socket *so)
1208 {
1209 if ((so->so_flags & SOF_CONTENT_FILTER) && so->so_cfil)
1210 return (so->so_cfil->cfi_sock_id);
1211 else
1212 return (CFIL_SOCK_ID_NONE);
1213 }
1214
1215 static struct socket *
1216 cfil_socket_from_sock_id(cfil_sock_id_t cfil_sock_id)
1217 {
1218 struct socket *so = NULL;
1219 u_int64_t gencnt = cfil_sock_id >> 32;
1220 u_int32_t flowhash = (u_int32_t)(cfil_sock_id & 0x0ffffffff);
1221 struct inpcb *inp = NULL;
1222 struct inpcbinfo *pcbinfo = &tcbinfo;
1223
1224 lck_rw_lock_shared(pcbinfo->ipi_lock);
1225 LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
1226 if (inp->inp_state != INPCB_STATE_DEAD &&
1227 inp->inp_socket != NULL &&
1228 inp->inp_flowhash == flowhash &&
1229 (inp->inp_socket->so_gencnt & 0x0ffffffff) == gencnt &&
1230 inp->inp_socket->so_cfil != NULL) {
1231 so = inp->inp_socket;
1232 break;
1233 }
1234 }
1235 lck_rw_done(pcbinfo->ipi_lock);
1236
1237 if (so == NULL) {
1238 OSIncrementAtomic(&cfil_stats.cfs_sock_id_not_found);
1239 CFIL_LOG(LOG_DEBUG,
1240 "no socket for sock_id %llx gencnt %llx flowhash %x",
1241 cfil_sock_id, gencnt, flowhash);
1242 }
1243
1244 return (so);
1245 }
1246
1247 static errno_t
1248 cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m,
1249 int flags)
1250 {
1251 #pragma unused(kctlref, flags)
1252 errno_t error = 0;
1253 struct cfil_msg_hdr *msghdr;
1254 struct content_filter *cfc = (struct content_filter *)unitinfo;
1255 struct socket *so;
1256 struct cfil_msg_action *action_msg;
1257 struct cfil_entry *entry;
1258
1259 CFIL_LOG(LOG_INFO, "");
1260
1261 if (content_filters == NULL) {
1262 CFIL_LOG(LOG_ERR, "no content filter");
1263 error = EINVAL;
1264 goto done;
1265 }
1266 if (kcunit > MAX_CONTENT_FILTER) {
1267 CFIL_LOG(LOG_ERR, "kcunit %u > MAX_CONTENT_FILTER (%d)",
1268 kcunit, MAX_CONTENT_FILTER);
1269 error = EINVAL;
1270 goto done;
1271 }
1272
1273 if (m_length(m) < sizeof(struct cfil_msg_hdr)) {
1274 CFIL_LOG(LOG_ERR, "too short %u", m_length(m));
1275 error = EINVAL;
1276 goto done;
1277 }
1278 msghdr = (struct cfil_msg_hdr *)mbuf_data(m);
1279 if (msghdr->cfm_version != CFM_VERSION_CURRENT) {
1280 CFIL_LOG(LOG_ERR, "bad version %u", msghdr->cfm_version);
1281 error = EINVAL;
1282 goto done;
1283 }
1284 if (msghdr->cfm_type != CFM_TYPE_ACTION) {
1285 CFIL_LOG(LOG_ERR, "bad type %u", msghdr->cfm_type);
1286 error = EINVAL;
1287 goto done;
1288 }
1289 /* Validate action operation */
1290 switch (msghdr->cfm_op) {
1291 case CFM_OP_DATA_UPDATE:
1292 OSIncrementAtomic(
1293 &cfil_stats.cfs_ctl_action_data_update);
1294 break;
1295 case CFM_OP_DROP:
1296 OSIncrementAtomic(&cfil_stats.cfs_ctl_action_drop);
1297 break;
1298 default:
1299 OSIncrementAtomic(&cfil_stats.cfs_ctl_action_bad_op);
1300 CFIL_LOG(LOG_ERR, "bad op %u", msghdr->cfm_op);
1301 error = EINVAL;
1302 goto done;
1303 }
1304 if (msghdr->cfm_len != sizeof(struct cfil_msg_action)) {
1305 OSIncrementAtomic(&cfil_stats.cfs_ctl_action_bad_len);
1306 error = EINVAL;
1307 CFIL_LOG(LOG_ERR, "bad len: %u for op %u",
1308 msghdr->cfm_len,
1309 msghdr->cfm_op);
1310 goto done;
1311 }
1312 cfil_rw_lock_shared(&cfil_lck_rw);
1313 if (cfc != (void *)content_filters[kcunit - 1]) {
1314 CFIL_LOG(LOG_ERR, "unitinfo does not match for kcunit %u",
1315 kcunit);
1316 error = EINVAL;
1317 cfil_rw_unlock_shared(&cfil_lck_rw);
1318 goto done;
1319 }
1320
1321 so = cfil_socket_from_sock_id(msghdr->cfm_sock_id);
1322 if (so == NULL) {
1323 CFIL_LOG(LOG_NOTICE, "bad sock_id %llx",
1324 msghdr->cfm_sock_id);
1325 error = EINVAL;
1326 cfil_rw_unlock_shared(&cfil_lck_rw);
1327 goto done;
1328 }
1329 cfil_rw_unlock_shared(&cfil_lck_rw);
1330
1331 socket_lock(so, 1);
1332
1333 if (so->so_cfil == NULL) {
1334 CFIL_LOG(LOG_NOTICE, "so %llx not attached",
1335 (uint64_t)VM_KERNEL_ADDRPERM(so));
1336 error = EINVAL;
1337 goto unlock;
1338 } else if (so->so_cfil->cfi_flags & CFIF_DROP) {
1339 CFIL_LOG(LOG_NOTICE, "so %llx drop set",
1340 (uint64_t)VM_KERNEL_ADDRPERM(so));
1341 error = EINVAL;
1342 goto unlock;
1343 }
1344 entry = &so->so_cfil->cfi_entries[kcunit - 1];
1345 if (entry->cfe_filter == NULL) {
1346 CFIL_LOG(LOG_NOTICE, "so %llx no filter",
1347 (uint64_t)VM_KERNEL_ADDRPERM(so));
1348 error = EINVAL;
1349 goto unlock;
1350 }
1351
1352 if (entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED)
1353 entry->cfe_flags |= CFEF_DATA_START;
1354 else {
1355 CFIL_LOG(LOG_ERR,
1356 "so %llx attached not sent for %u",
1357 (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit);
1358 error = EINVAL;
1359 goto unlock;
1360 }
1361
1362 microuptime(&entry->cfe_last_action);
1363
1364 action_msg = (struct cfil_msg_action *)msghdr;
1365
1366 switch (msghdr->cfm_op) {
1367 case CFM_OP_DATA_UPDATE:
1368 if (action_msg->cfa_out_peek_offset != 0 ||
1369 action_msg->cfa_out_pass_offset != 0)
1370 error = cfil_action_data_pass(so, kcunit, 1,
1371 action_msg->cfa_out_pass_offset,
1372 action_msg->cfa_out_peek_offset);
1373 if (error == EJUSTRETURN)
1374 error = 0;
1375 if (error != 0)
1376 break;
1377 if (action_msg->cfa_in_peek_offset != 0 ||
1378 action_msg->cfa_in_pass_offset != 0)
1379 error = cfil_action_data_pass(so, kcunit, 0,
1380 action_msg->cfa_in_pass_offset,
1381 action_msg->cfa_in_peek_offset);
1382 if (error == EJUSTRETURN)
1383 error = 0;
1384 break;
1385
1386 case CFM_OP_DROP:
1387 error = cfil_action_drop(so, kcunit);
1388 break;
1389
1390 default:
1391 error = EINVAL;
1392 break;
1393 }
1394 unlock:
1395 socket_unlock(so, 1);
1396 done:
1397 mbuf_freem(m);
1398
1399 if (error == 0)
1400 OSIncrementAtomic(&cfil_stats.cfs_ctl_send_ok);
1401 else
1402 OSIncrementAtomic(&cfil_stats.cfs_ctl_send_bad);
1403
1404 return (error);
1405 }
1406
1407 static errno_t
1408 cfil_ctl_getopt(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
1409 int opt, void *data, size_t *len)
1410 {
1411 #pragma unused(kctlref, opt)
1412 errno_t error = 0;
1413 struct content_filter *cfc = (struct content_filter *)unitinfo;
1414
1415 CFIL_LOG(LOG_NOTICE, "");
1416
1417 cfil_rw_lock_shared(&cfil_lck_rw);
1418
1419 if (content_filters == NULL) {
1420 CFIL_LOG(LOG_ERR, "no content filter");
1421 error = EINVAL;
1422 goto done;
1423 }
1424 if (kcunit > MAX_CONTENT_FILTER) {
1425 CFIL_LOG(LOG_ERR, "kcunit %u > MAX_CONTENT_FILTER (%d)",
1426 kcunit, MAX_CONTENT_FILTER);
1427 error = EINVAL;
1428 goto done;
1429 }
1430 if (cfc != (void *)content_filters[kcunit - 1]) {
1431 CFIL_LOG(LOG_ERR, "unitinfo does not match for kcunit %u",
1432 kcunit);
1433 error = EINVAL;
1434 goto done;
1435 }
1436 switch (opt) {
1437 case CFIL_OPT_NECP_CONTROL_UNIT:
1438 if (*len < sizeof(uint32_t)) {
1439 CFIL_LOG(LOG_ERR, "len too small %lu", *len);
1440 error = EINVAL;
1441 goto done;
1442 }
1443 if (data != NULL)
1444 *(uint32_t *)data = cfc->cf_necp_control_unit;
1445 break;
1446 default:
1447 error = ENOPROTOOPT;
1448 break;
1449 }
1450 done:
1451 cfil_rw_unlock_shared(&cfil_lck_rw);
1452
1453 return (error);
1454 }
1455
1456 static errno_t
1457 cfil_ctl_setopt(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
1458 int opt, void *data, size_t len)
1459 {
1460 #pragma unused(kctlref, opt)
1461 errno_t error = 0;
1462 struct content_filter *cfc = (struct content_filter *)unitinfo;
1463
1464 CFIL_LOG(LOG_NOTICE, "");
1465
1466 cfil_rw_lock_exclusive(&cfil_lck_rw);
1467
1468 if (content_filters == NULL) {
1469 CFIL_LOG(LOG_ERR, "no content filter");
1470 error = EINVAL;
1471 goto done;
1472 }
1473 if (kcunit > MAX_CONTENT_FILTER) {
1474 CFIL_LOG(LOG_ERR, "kcunit %u > MAX_CONTENT_FILTER (%d)",
1475 kcunit, MAX_CONTENT_FILTER);
1476 error = EINVAL;
1477 goto done;
1478 }
1479 if (cfc != (void *)content_filters[kcunit - 1]) {
1480 CFIL_LOG(LOG_ERR, "unitinfo does not match for kcunit %u",
1481 kcunit);
1482 error = EINVAL;
1483 goto done;
1484 }
1485 switch (opt) {
1486 case CFIL_OPT_NECP_CONTROL_UNIT:
1487 if (len < sizeof(uint32_t)) {
1488 CFIL_LOG(LOG_ERR, "CFIL_OPT_NECP_CONTROL_UNIT "
1489 "len too small %lu", len);
1490 error = EINVAL;
1491 goto done;
1492 }
1493 if (cfc->cf_necp_control_unit != 0) {
1494 CFIL_LOG(LOG_ERR, "CFIL_OPT_NECP_CONTROL_UNIT "
1495 "already set %u",
1496 cfc->cf_necp_control_unit);
1497 error = EINVAL;
1498 goto done;
1499 }
1500 cfc->cf_necp_control_unit = *(uint32_t *)data;
1501 break;
1502 default:
1503 error = ENOPROTOOPT;
1504 break;
1505 }
1506 done:
1507 cfil_rw_unlock_exclusive(&cfil_lck_rw);
1508
1509 return (error);
1510 }
1511
1512
1513 static void
1514 cfil_ctl_rcvd(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, int flags)
1515 {
1516 #pragma unused(kctlref, flags)
1517 struct content_filter *cfc = (struct content_filter *)unitinfo;
1518 struct socket *so = NULL;
1519 int error;
1520 struct cfil_entry *entry;
1521
1522 CFIL_LOG(LOG_INFO, "");
1523
1524 if (content_filters == NULL) {
1525 CFIL_LOG(LOG_ERR, "no content filter");
1526 OSIncrementAtomic(&cfil_stats.cfs_ctl_rcvd_bad);
1527 return;
1528 }
1529 if (kcunit > MAX_CONTENT_FILTER) {
1530 CFIL_LOG(LOG_ERR, "kcunit %u > MAX_CONTENT_FILTER (%d)",
1531 kcunit, MAX_CONTENT_FILTER);
1532 OSIncrementAtomic(&cfil_stats.cfs_ctl_rcvd_bad);
1533 return;
1534 }
1535 cfil_rw_lock_shared(&cfil_lck_rw);
1536 if (cfc != (void *)content_filters[kcunit - 1]) {
1537 CFIL_LOG(LOG_ERR, "unitinfo does not match for kcunit %u",
1538 kcunit);
1539 OSIncrementAtomic(&cfil_stats.cfs_ctl_rcvd_bad);
1540 goto done;
1541 }
1542 /* Let's assume the flow control is lifted */
1543 if (cfc->cf_flags & CFF_FLOW_CONTROLLED) {
1544 if (!cfil_rw_lock_shared_to_exclusive(&cfil_lck_rw))
1545 cfil_rw_lock_exclusive(&cfil_lck_rw);
1546
1547 cfc->cf_flags &= ~CFF_FLOW_CONTROLLED;
1548
1549 cfil_rw_lock_exclusive_to_shared(&cfil_lck_rw);
1550 lck_rw_assert(&cfil_lck_rw, LCK_RW_ASSERT_SHARED);
1551 }
1552 /*
1553 * Flow control will be raised again as soon as an entry cannot enqueue
1554 * to the kernel control socket
1555 */
1556 while ((cfc->cf_flags & CFF_FLOW_CONTROLLED) == 0) {
1557 verify_content_filter(cfc);
1558
1559 cfil_rw_lock_assert_held(&cfil_lck_rw, 0);
1560
1561 /* Find an entry that is flow controlled */
1562 TAILQ_FOREACH(entry, &cfc->cf_sock_entries, cfe_link) {
1563 if (entry->cfe_cfil_info == NULL ||
1564 entry->cfe_cfil_info->cfi_so == NULL)
1565 continue;
1566 if ((entry->cfe_flags & CFEF_FLOW_CONTROLLED) == 0)
1567 continue;
1568 }
1569 if (entry == NULL)
1570 break;
1571
1572 OSIncrementAtomic(&cfil_stats.cfs_ctl_rcvd_flow_lift);
1573
1574 so = entry->cfe_cfil_info->cfi_so;
1575
1576 cfil_rw_unlock_shared(&cfil_lck_rw);
1577 socket_lock(so, 1);
1578
1579 do {
1580 error = cfil_acquire_sockbuf(so, 1);
1581 if (error == 0)
1582 error = cfil_data_service_ctl_q(so, kcunit, 1);
1583 cfil_release_sockbuf(so, 1);
1584 if (error != 0)
1585 break;
1586
1587 error = cfil_acquire_sockbuf(so, 0);
1588 if (error == 0)
1589 error = cfil_data_service_ctl_q(so, kcunit, 0);
1590 cfil_release_sockbuf(so, 0);
1591 } while (0);
1592
1593 socket_lock_assert_owned(so);
1594 socket_unlock(so, 1);
1595
1596 cfil_rw_lock_shared(&cfil_lck_rw);
1597 }
1598 done:
1599 cfil_rw_unlock_shared(&cfil_lck_rw);
1600 }
1601
1602 void
1603 cfil_init(void)
1604 {
1605 struct kern_ctl_reg kern_ctl;
1606 errno_t error = 0;
1607 vm_size_t content_filter_size = 0; /* size of content_filter */
1608 vm_size_t cfil_info_size = 0; /* size of cfil_info */
1609
1610 CFIL_LOG(LOG_NOTICE, "");
1611
1612 /*
1613 * Compile time verifications
1614 */
1615 _CASSERT(CFIL_MAX_FILTER_COUNT == MAX_CONTENT_FILTER);
1616 _CASSERT(sizeof(struct cfil_filter_stat) % sizeof(uint32_t) == 0);
1617 _CASSERT(sizeof(struct cfil_entry_stat) % sizeof(uint32_t) == 0);
1618 _CASSERT(sizeof(struct cfil_sock_stat) % sizeof(uint32_t) == 0);
1619
1620 /*
1621 	 * Runtime verifications
1622 */
1623 VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_ctl_q_in_enqueued,
1624 sizeof(uint32_t)));
1625 VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_ctl_q_out_enqueued,
1626 sizeof(uint32_t)));
1627 VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_ctl_q_in_peeked,
1628 sizeof(uint32_t)));
1629 VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_ctl_q_out_peeked,
1630 sizeof(uint32_t)));
1631
1632 VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_pending_q_in_enqueued,
1633 sizeof(uint32_t)));
1634 VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_pending_q_out_enqueued,
1635 sizeof(uint32_t)));
1636
1637 VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_inject_q_in_enqueued,
1638 sizeof(uint32_t)));
1639 VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_inject_q_out_enqueued,
1640 sizeof(uint32_t)));
1641 VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_inject_q_in_passed,
1642 sizeof(uint32_t)));
1643 VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_inject_q_out_passed,
1644 sizeof(uint32_t)));
1645
1646 /*
1647 * Zone for content filters kernel control sockets
1648 */
1649 content_filter_size = sizeof(struct content_filter);
1650 content_filter_zone = zinit(content_filter_size,
1651 CONTENT_FILTER_ZONE_MAX * content_filter_size,
1652 0,
1653 CONTENT_FILTER_ZONE_NAME);
1654 if (content_filter_zone == NULL) {
1655 panic("%s: zinit(%s) failed", __func__,
1656 CONTENT_FILTER_ZONE_NAME);
1657 /* NOTREACHED */
1658 }
1659 zone_change(content_filter_zone, Z_CALLERACCT, FALSE);
1660 zone_change(content_filter_zone, Z_EXPAND, TRUE);
1661
1662 /*
1663 * Zone for per socket content filters
1664 */
1665 cfil_info_size = sizeof(struct cfil_info);
1666 cfil_info_zone = zinit(cfil_info_size,
1667 CFIL_INFO_ZONE_MAX * cfil_info_size,
1668 0,
1669 CFIL_INFO_ZONE_NAME);
1670 if (cfil_info_zone == NULL) {
1671 panic("%s: zinit(%s) failed", __func__, CFIL_INFO_ZONE_NAME);
1672 /* NOTREACHED */
1673 }
1674 zone_change(cfil_info_zone, Z_CALLERACCT, FALSE);
1675 zone_change(cfil_info_zone, Z_EXPAND, TRUE);
1676
1677 /*
1678 * Allocate locks
1679 */
1680 cfil_lck_grp_attr = lck_grp_attr_alloc_init();
1681 if (cfil_lck_grp_attr == NULL) {
1682 panic("%s: lck_grp_attr_alloc_init failed", __func__);
1683 /* NOTREACHED */
1684 }
1685 cfil_lck_grp = lck_grp_alloc_init("content filter",
1686 cfil_lck_grp_attr);
1687 if (cfil_lck_grp == NULL) {
1688 panic("%s: lck_grp_alloc_init failed", __func__);
1689 /* NOTREACHED */
1690 }
1691 cfil_lck_attr = lck_attr_alloc_init();
1692 if (cfil_lck_attr == NULL) {
1693 panic("%s: lck_attr_alloc_init failed", __func__);
1694 /* NOTREACHED */
1695 }
1696 lck_rw_init(&cfil_lck_rw, cfil_lck_grp, cfil_lck_attr);
1697
1698 TAILQ_INIT(&cfil_sock_head);
1699
1700 /*
1701 * Register kernel control
1702 */
1703 bzero(&kern_ctl, sizeof(kern_ctl));
1704 strlcpy(kern_ctl.ctl_name, CONTENT_FILTER_CONTROL_NAME,
1705 sizeof(kern_ctl.ctl_name));
1706 kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED | CTL_FLAG_REG_EXTENDED;
1707 kern_ctl.ctl_sendsize = 512 * 1024; /* enough? */
1708 kern_ctl.ctl_recvsize = 512 * 1024; /* enough? */
1709 kern_ctl.ctl_connect = cfil_ctl_connect;
1710 kern_ctl.ctl_disconnect = cfil_ctl_disconnect;
1711 kern_ctl.ctl_send = cfil_ctl_send;
1712 kern_ctl.ctl_getopt = cfil_ctl_getopt;
1713 kern_ctl.ctl_setopt = cfil_ctl_setopt;
1714 kern_ctl.ctl_rcvd = cfil_ctl_rcvd;
1715 error = ctl_register(&kern_ctl, &cfil_kctlref);
1716 if (error != 0) {
1717 CFIL_LOG(LOG_ERR, "ctl_register failed: %d", error);
1718 return;
1719 }
1720 }
1721
1722 struct cfil_info *
1723 cfil_info_alloc(struct socket *so)
1724 {
1725 int kcunit;
1726 struct cfil_info *cfil_info = NULL;
1727 struct inpcb *inp = sotoinpcb(so);
1728
1729 CFIL_LOG(LOG_INFO, "");
1730
1731 socket_lock_assert_owned(so);
1732
1733 cfil_info = zalloc(cfil_info_zone);
1734 if (cfil_info == NULL)
1735 goto done;
1736 bzero(cfil_info, sizeof(struct cfil_info));
1737
1738 cfil_queue_init(&cfil_info->cfi_snd.cfi_inject_q);
1739 cfil_queue_init(&cfil_info->cfi_rcv.cfi_inject_q);
1740
1741 for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
1742 struct cfil_entry *entry;
1743
1744 entry = &cfil_info->cfi_entries[kcunit - 1];
1745 entry->cfe_cfil_info = cfil_info;
1746
1747 /* Initialize the filter entry */
1748 entry->cfe_filter = NULL;
1749 entry->cfe_flags = 0;
1750 entry->cfe_necp_control_unit = 0;
1751 entry->cfe_snd.cfe_pass_offset = 0;
1752 entry->cfe_snd.cfe_peek_offset = 0;
1753 entry->cfe_snd.cfe_peeked = 0;
1754 entry->cfe_rcv.cfe_pass_offset = 0;
1755 entry->cfe_rcv.cfe_peek_offset = 0;
1756 entry->cfe_rcv.cfe_peeked = 0;
1757
1758 cfil_queue_init(&entry->cfe_snd.cfe_pending_q);
1759 cfil_queue_init(&entry->cfe_rcv.cfe_pending_q);
1760 cfil_queue_init(&entry->cfe_snd.cfe_ctl_q);
1761 cfil_queue_init(&entry->cfe_rcv.cfe_ctl_q);
1762 }
1763
1764 cfil_rw_lock_exclusive(&cfil_lck_rw);
1765
1766 so->so_cfil = cfil_info;
1767 cfil_info->cfi_so = so;
1768 /*
1769 * Create a cfi_sock_id that's not the socket pointer!
1770 */
1771 if (inp->inp_flowhash == 0)
1772 inp->inp_flowhash = inp_calc_flowhash(inp);
1773 cfil_info->cfi_sock_id =
1774 ((so->so_gencnt << 32) | inp->inp_flowhash);
1775
1776 TAILQ_INSERT_TAIL(&cfil_sock_head, cfil_info, cfi_link);
1777
1778 cfil_sock_attached_count++;
1779
1780 cfil_rw_unlock_exclusive(&cfil_lck_rw);
1781
1782 done:
1783 if (cfil_info != NULL)
1784 OSIncrementAtomic(&cfil_stats.cfs_cfi_alloc_ok);
1785 else
1786 OSIncrementAtomic(&cfil_stats.cfs_cfi_alloc_fail);
1787
1788 return (cfil_info);
1789 }
1790
1791 int
1792 cfil_info_attach_unit(struct socket *so, uint32_t filter_control_unit)
1793 {
1794 int kcunit;
1795 struct cfil_info *cfil_info = so->so_cfil;
1796 int attached = 0;
1797
1798 CFIL_LOG(LOG_INFO, "");
1799
1800 socket_lock_assert_owned(so);
1801
1802 cfil_rw_lock_exclusive(&cfil_lck_rw);
1803
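	/*
	 * Walk the registered content filters and attach to the first one
	 * whose NECP control unit matches; a single call attaches at most
	 * one entry (note the break at the end of the loop).
	 */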
1804 for (kcunit = 1;
1805 content_filters != NULL && kcunit <= MAX_CONTENT_FILTER;
1806 kcunit++) {
1807 struct content_filter *cfc = content_filters[kcunit - 1];
1808 struct cfil_entry *entry;
1809
1810 if (cfc == NULL)
1811 continue;
1812 if (cfc->cf_necp_control_unit != filter_control_unit)
1813 continue;
1814
1815 entry = &cfil_info->cfi_entries[kcunit - 1];
1816
1817 entry->cfe_filter = cfc;
1818 entry->cfe_necp_control_unit = filter_control_unit;
1819 TAILQ_INSERT_TAIL(&cfc->cf_sock_entries, entry, cfe_link);
1820 cfc->cf_sock_count++;
1821 verify_content_filter(cfc);
1822 attached = 1;
1823 entry->cfe_flags |= CFEF_CFIL_ATTACHED;
1824 break;
1825 }
1826
1827 cfil_rw_unlock_exclusive(&cfil_lck_rw);
1828
1829 return (attached);
1830 }
1831
1832 static void
1833 cfil_info_free(struct socket *so, struct cfil_info *cfil_info)
1834 {
1835 int kcunit;
1836 uint64_t in_drain = 0;
1837 uint64_t out_drained = 0;
1838
1839 so->so_cfil = NULL;
1840
1841 if (so->so_flags & SOF_CONTENT_FILTER) {
1842 so->so_flags &= ~SOF_CONTENT_FILTER;
1843 VERIFY(so->so_usecount > 0);
1844 so->so_usecount--;
1845 }
1846 if (cfil_info == NULL)
1847 return;
1848
1849 CFIL_LOG(LOG_INFO, "");
1850
1851 cfil_rw_lock_exclusive(&cfil_lck_rw);
1852
1853 for (kcunit = 1;
1854 content_filters != NULL && kcunit <= MAX_CONTENT_FILTER;
1855 kcunit++) {
1856 struct cfil_entry *entry;
1857 struct content_filter *cfc;
1858
1859 entry = &cfil_info->cfi_entries[kcunit - 1];
1860
1861 /* Don't be silly and try to detach twice */
1862 if (entry->cfe_filter == NULL)
1863 continue;
1864
1865 cfc = content_filters[kcunit - 1];
1866
1867 VERIFY(cfc == entry->cfe_filter);
1868
1869 entry->cfe_filter = NULL;
1870 entry->cfe_necp_control_unit = 0;
1871 TAILQ_REMOVE(&cfc->cf_sock_entries, entry, cfe_link);
1872 cfc->cf_sock_count--;
1873
1874 verify_content_filter(cfc);
1875 }
1876 cfil_sock_attached_count--;
1877 TAILQ_REMOVE(&cfil_sock_head, cfil_info, cfi_link);
1878
1879 out_drained += cfil_queue_drain(&cfil_info->cfi_snd.cfi_inject_q);
1880 in_drain += cfil_queue_drain(&cfil_info->cfi_rcv.cfi_inject_q);
1881
1882 for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
1883 struct cfil_entry *entry;
1884
1885 entry = &cfil_info->cfi_entries[kcunit - 1];
1886 out_drained += cfil_queue_drain(&entry->cfe_snd.cfe_pending_q);
1887 in_drain += cfil_queue_drain(&entry->cfe_rcv.cfe_pending_q);
1888 out_drained += cfil_queue_drain(&entry->cfe_snd.cfe_ctl_q);
1889 in_drain += cfil_queue_drain(&entry->cfe_rcv.cfe_ctl_q);
1890 }
1891 cfil_rw_unlock_exclusive(&cfil_lck_rw);
1892
1893 if (out_drained)
1894 OSIncrementAtomic(&cfil_stats.cfs_flush_out_free);
1895 if (in_drain)
1896 OSIncrementAtomic(&cfil_stats.cfs_flush_in_free);
1897
1898 zfree(cfil_info_zone, cfil_info);
1899 }
1900
1901 /*
1902 * Entry point from Sockets layer
1903 * The socket is locked.
1904 */
1905 errno_t
1906 cfil_sock_attach(struct socket *so)
1907 {
1908 errno_t error = 0;
1909 uint32_t filter_control_unit;
1910
1911 socket_lock_assert_owned(so);
1912
1913 	/* Limit ourselves to TCP sockets that are not MPTCP subflows */
1914 if ((so->so_proto->pr_domain->dom_family != PF_INET &&
1915 so->so_proto->pr_domain->dom_family != PF_INET6) ||
1916 so->so_proto->pr_type != SOCK_STREAM ||
1917 so->so_proto->pr_protocol != IPPROTO_TCP ||
1918 (so->so_flags & SOF_MP_SUBFLOW) != 0)
1919 goto done;
1920
1921 filter_control_unit = necp_socket_get_content_filter_control_unit(so);
1922 if (filter_control_unit == 0)
1923 goto done;
1924
1925 if ((filter_control_unit & NECP_MASK_USERSPACE_ONLY) != 0) {
1926 OSIncrementAtomic(&cfil_stats.cfs_sock_userspace_only);
1927 goto done;
1928 }
1929 if (cfil_active_count == 0) {
1930 OSIncrementAtomic(&cfil_stats.cfs_sock_attach_in_vain);
1931 goto done;
1932 }
1933 if (so->so_cfil != NULL) {
1934 OSIncrementAtomic(&cfil_stats.cfs_sock_attach_already);
1935 CFIL_LOG(LOG_ERR, "already attached");
1936 } else {
1937 cfil_info_alloc(so);
1938 if (so->so_cfil == NULL) {
1939 error = ENOMEM;
1940 OSIncrementAtomic(&cfil_stats.cfs_sock_attach_no_mem);
1941 goto done;
1942 }
1943 }
1944 if (cfil_info_attach_unit(so, filter_control_unit) == 0) {
1945 CFIL_LOG(LOG_ERR, "cfil_info_attach_unit(%u) failed",
1946 filter_control_unit);
1947 OSIncrementAtomic(&cfil_stats.cfs_sock_attach_failed);
1948 goto done;
1949 }
1950 CFIL_LOG(LOG_INFO, "so %llx filter_control_unit %u sockid %llx",
1951 (uint64_t)VM_KERNEL_ADDRPERM(so),
1952 filter_control_unit, so->so_cfil->cfi_sock_id);
1953
1954 so->so_flags |= SOF_CONTENT_FILTER;
1955 OSIncrementAtomic(&cfil_stats.cfs_sock_attached);
1956
1957 /* Hold a reference on the socket */
1958 so->so_usecount++;
1959
1960 error = cfil_dispatch_attach_event(so, filter_control_unit);
1961 /* We can recover from flow control or out of memory errors */
1962 if (error == ENOBUFS || error == ENOMEM)
1963 error = 0;
1964 else if (error != 0)
1965 goto done;
1966
1967 CFIL_INFO_VERIFY(so->so_cfil);
1968 done:
1969 return (error);
1970 }
1971
1972 /*
1973 * Entry point from Sockets layer
1974 * The socket is locked.
1975 */
1976 errno_t
1977 cfil_sock_detach(struct socket *so)
1978 {
1979 if (so->so_cfil) {
1980 cfil_info_free(so, so->so_cfil);
1981 OSIncrementAtomic(&cfil_stats.cfs_sock_detached);
1982 }
1983 return (0);
1984 }
1985
1986 static int
1987 cfil_dispatch_attach_event(struct socket *so, uint32_t filter_control_unit)
1988 {
1989 errno_t error = 0;
1990 struct cfil_entry *entry = NULL;
1991 struct cfil_msg_sock_attached msg_attached;
1992 uint32_t kcunit;
1993 struct content_filter *cfc;
1994
1995 socket_lock_assert_owned(so);
1996
1997 cfil_rw_lock_shared(&cfil_lck_rw);
1998
1999 if (so->so_proto == NULL || so->so_proto->pr_domain == NULL) {
2000 error = EINVAL;
2001 goto done;
2002 }
2003 /*
2004 * Find the matching filter unit
2005 */
2006 for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
2007 cfc = content_filters[kcunit - 1];
2008
2009 if (cfc == NULL)
2010 continue;
2011 if (cfc->cf_necp_control_unit != filter_control_unit)
2012 continue;
2013 entry = &so->so_cfil->cfi_entries[kcunit - 1];
2014 if (entry->cfe_filter == NULL)
2015 continue;
2016
2017 VERIFY(cfc == entry->cfe_filter);
2018
2019 break;
2020 }
2021
2022 if (entry == NULL || entry->cfe_filter == NULL)
2023 goto done;
2024
2025 if ((entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED))
2026 goto done;
2027
2028 CFIL_LOG(LOG_INFO, "so %llx filter_control_unit %u kcunit %u",
2029 (uint64_t)VM_KERNEL_ADDRPERM(so), filter_control_unit, kcunit);
2030
2031 /* Would be wasteful to try when flow controlled */
2032 if (cfc->cf_flags & CFF_FLOW_CONTROLLED) {
2033 error = ENOBUFS;
2034 goto done;
2035 }
2036
2037 bzero(&msg_attached, sizeof(struct cfil_msg_sock_attached));
2038 msg_attached.cfs_msghdr.cfm_len = sizeof(struct cfil_msg_sock_attached);
2039 msg_attached.cfs_msghdr.cfm_version = CFM_VERSION_CURRENT;
2040 msg_attached.cfs_msghdr.cfm_type = CFM_TYPE_EVENT;
2041 msg_attached.cfs_msghdr.cfm_op = CFM_OP_SOCKET_ATTACHED;
2042 msg_attached.cfs_msghdr.cfm_sock_id = entry->cfe_cfil_info->cfi_sock_id;
2043
2044 msg_attached.cfs_sock_family = so->so_proto->pr_domain->dom_family;
2045 msg_attached.cfs_sock_type = so->so_proto->pr_type;
2046 msg_attached.cfs_sock_protocol = so->so_proto->pr_protocol;
2047 msg_attached.cfs_pid = so->last_pid;
2048 memcpy(msg_attached.cfs_uuid, so->last_uuid, sizeof(uuid_t));
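	/*
	 * Report the delegate's identity in the effective fields when the
	 * socket is delegated, otherwise reuse the last known pid/uuid so
	 * the user space agent always gets a usable effective identity.
	 */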
2049 if (so->so_flags & SOF_DELEGATED) {
2050 msg_attached.cfs_e_pid = so->e_pid;
2051 memcpy(msg_attached.cfs_e_uuid, so->e_uuid, sizeof(uuid_t));
2052 } else {
2053 msg_attached.cfs_e_pid = so->last_pid;
2054 memcpy(msg_attached.cfs_e_uuid, so->last_uuid, sizeof(uuid_t));
2055 }
2056 error = ctl_enqueuedata(entry->cfe_filter->cf_kcref,
2057 entry->cfe_filter->cf_kcunit,
2058 &msg_attached,
2059 sizeof(struct cfil_msg_sock_attached),
2060 CTL_DATA_EOR);
2061 if (error != 0) {
2062 CFIL_LOG(LOG_ERR, "ctl_enqueuedata() failed: %d", error);
2063 goto done;
2064 }
2065 microuptime(&entry->cfe_last_event);
2066 entry->cfe_flags |= CFEF_SENT_SOCK_ATTACHED;
2067 OSIncrementAtomic(&cfil_stats.cfs_attach_event_ok);
2068 done:
2069
2070 /* We can recover from flow control */
2071 if (error == ENOBUFS) {
2072 entry->cfe_flags |= CFEF_FLOW_CONTROLLED;
2073 OSIncrementAtomic(&cfil_stats.cfs_attach_event_flow_control);
2074
2075 if (!cfil_rw_lock_shared_to_exclusive(&cfil_lck_rw))
2076 cfil_rw_lock_exclusive(&cfil_lck_rw);
2077
2078 cfc->cf_flags |= CFF_FLOW_CONTROLLED;
2079
2080 cfil_rw_unlock_exclusive(&cfil_lck_rw);
2081 } else {
2082 if (error != 0)
2083 OSIncrementAtomic(&cfil_stats.cfs_attach_event_fail);
2084
2085 cfil_rw_unlock_shared(&cfil_lck_rw);
2086 }
2087 return (error);
2088 }
2089
2090 static int
2091 cfil_dispatch_disconnect_event(struct socket *so, uint32_t kcunit, int outgoing)
2092 {
2093 errno_t error = 0;
2094 struct mbuf *msg = NULL;
2095 struct cfil_entry *entry;
2096 struct cfe_buf *entrybuf;
2097 struct cfil_msg_hdr msg_disconnected;
2098 struct content_filter *cfc;
2099
2100 socket_lock_assert_owned(so);
2101
2102 cfil_rw_lock_shared(&cfil_lck_rw);
2103
2104 entry = &so->so_cfil->cfi_entries[kcunit - 1];
2105 if (outgoing)
2106 entrybuf = &entry->cfe_snd;
2107 else
2108 entrybuf = &entry->cfe_rcv;
2109
2110 cfc = entry->cfe_filter;
2111 if (cfc == NULL)
2112 goto done;
2113
2114 CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d",
2115 (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing);
2116
2117 /*
2118 * Send the disconnection event once
2119 */
2120 if ((outgoing && (entry->cfe_flags & CFEF_SENT_DISCONNECT_OUT)) ||
2121 (!outgoing && (entry->cfe_flags & CFEF_SENT_DISCONNECT_IN))) {
2122 CFIL_LOG(LOG_INFO, "so %llx disconnect already sent",
2123 (uint64_t)VM_KERNEL_ADDRPERM(so));
2124 goto done;
2125 }
2126
2127 /*
2128 * We're not disconnected as long as some data is waiting
2129 * to be delivered to the filter
2130 */
2131 if (outgoing && cfil_queue_empty(&entrybuf->cfe_ctl_q) == 0) {
2132 CFIL_LOG(LOG_INFO, "so %llx control queue not empty",
2133 (uint64_t)VM_KERNEL_ADDRPERM(so));
2134 error = EBUSY;
2135 goto done;
2136 }
2137 /* Would be wasteful to try when flow controlled */
2138 if (cfc->cf_flags & CFF_FLOW_CONTROLLED) {
2139 error = ENOBUFS;
2140 goto done;
2141 }
2142
2143 bzero(&msg_disconnected, sizeof(struct cfil_msg_hdr));
2144 msg_disconnected.cfm_len = sizeof(struct cfil_msg_hdr);
2145 msg_disconnected.cfm_version = CFM_VERSION_CURRENT;
2146 msg_disconnected.cfm_type = CFM_TYPE_EVENT;
2147 msg_disconnected.cfm_op = outgoing ? CFM_OP_DISCONNECT_OUT :
2148 CFM_OP_DISCONNECT_IN;
2149 msg_disconnected.cfm_sock_id = entry->cfe_cfil_info->cfi_sock_id;
2150 error = ctl_enqueuedata(entry->cfe_filter->cf_kcref,
2151 entry->cfe_filter->cf_kcunit,
2152 &msg_disconnected,
2153 sizeof(struct cfil_msg_hdr),
2154 CTL_DATA_EOR);
2155 if (error != 0) {
2156 		CFIL_LOG(LOG_ERR, "ctl_enqueuedata() failed: %d", error);
2157 mbuf_freem(msg);
2158 goto done;
2159 }
2160 microuptime(&entry->cfe_last_event);
2161
2162 /* Remember we have sent the disconnection message */
2163 if (outgoing) {
2164 entry->cfe_flags |= CFEF_SENT_DISCONNECT_OUT;
2165 OSIncrementAtomic(&cfil_stats.cfs_disconnect_out_event_ok);
2166 } else {
2167 entry->cfe_flags |= CFEF_SENT_DISCONNECT_IN;
2168 OSIncrementAtomic(&cfil_stats.cfs_disconnect_in_event_ok);
2169 }
2170 done:
2171 if (error == ENOBUFS) {
2172 entry->cfe_flags |= CFEF_FLOW_CONTROLLED;
2173 OSIncrementAtomic(
2174 &cfil_stats.cfs_disconnect_event_flow_control);
2175
2176 if (!cfil_rw_lock_shared_to_exclusive(&cfil_lck_rw))
2177 cfil_rw_lock_exclusive(&cfil_lck_rw);
2178
2179 cfc->cf_flags |= CFF_FLOW_CONTROLLED;
2180
2181 cfil_rw_unlock_exclusive(&cfil_lck_rw);
2182 } else {
2183 if (error != 0)
2184 OSIncrementAtomic(
2185 &cfil_stats.cfs_disconnect_event_fail);
2186
2187 cfil_rw_unlock_shared(&cfil_lck_rw);
2188 }
2189 return (error);
2190 }
2191
2192 int
2193 cfil_dispatch_closed_event(struct socket *so, int kcunit)
2194 {
2195 struct cfil_entry *entry;
2196 struct cfil_msg_hdr msg_closed;
2197 errno_t error = 0;
2198 struct content_filter *cfc;
2199
2200 socket_lock_assert_owned(so);
2201
2202 cfil_rw_lock_shared(&cfil_lck_rw);
2203
2204 entry = &so->so_cfil->cfi_entries[kcunit - 1];
2205 cfc = entry->cfe_filter;
2206 if (cfc == NULL)
2207 goto done;
2208
2209 CFIL_LOG(LOG_INFO, "so %llx kcunit %d",
2210 (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit);
2211
2212 /* Would be wasteful to try when flow controlled */
2213 if (cfc->cf_flags & CFF_FLOW_CONTROLLED) {
2214 error = ENOBUFS;
2215 goto done;
2216 }
2217 /*
2218 * Send a single closed message per filter
2219 */
2220 if ((entry->cfe_flags & CFEF_SENT_SOCK_CLOSED) != 0)
2221 goto done;
2222 if ((entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED) == 0)
2223 goto done;
2224
2225 bzero(&msg_closed, sizeof(struct cfil_msg_hdr));
2226 msg_closed.cfm_len = sizeof(struct cfil_msg_hdr);
2227 msg_closed.cfm_version = CFM_VERSION_CURRENT;
2228 msg_closed.cfm_type = CFM_TYPE_EVENT;
2229 msg_closed.cfm_op = CFM_OP_SOCKET_CLOSED;
2230 msg_closed.cfm_sock_id = entry->cfe_cfil_info->cfi_sock_id;
2231 error = ctl_enqueuedata(entry->cfe_filter->cf_kcref,
2232 entry->cfe_filter->cf_kcunit,
2233 &msg_closed,
2234 sizeof(struct cfil_msg_hdr),
2235 CTL_DATA_EOR);
2236 if (error != 0) {
2237 CFIL_LOG(LOG_ERR, "ctl_enqueuedata() failed: %d",
2238 error);
2239 goto done;
2240 }
2241 microuptime(&entry->cfe_last_event);
2242 entry->cfe_flags |= CFEF_SENT_SOCK_CLOSED;
2243 OSIncrementAtomic(&cfil_stats.cfs_closed_event_ok);
2244 done:
2245 /* We can recover from flow control */
2246 if (error == ENOBUFS) {
2247 entry->cfe_flags |= CFEF_FLOW_CONTROLLED;
2248 OSIncrementAtomic(&cfil_stats.cfs_closed_event_flow_control);
2249
2250 if (!cfil_rw_lock_shared_to_exclusive(&cfil_lck_rw))
2251 cfil_rw_lock_exclusive(&cfil_lck_rw);
2252
2253 cfc->cf_flags |= CFF_FLOW_CONTROLLED;
2254
2255 cfil_rw_unlock_exclusive(&cfil_lck_rw);
2256 } else {
2257 if (error != 0)
2258 OSIncrementAtomic(&cfil_stats.cfs_closed_event_fail);
2259
2260 cfil_rw_unlock_shared(&cfil_lck_rw);
2261 }
2262
2263 return (error);
2264 }
2265
2266 static void
2267 fill_ip6_sockaddr_4_6(union sockaddr_in_4_6 *sin46,
2268 struct in6_addr *ip6, u_int16_t port)
2269 {
2270 struct sockaddr_in6 *sin6 = &sin46->sin6;
2271
2272 sin6->sin6_family = AF_INET6;
2273 sin6->sin6_len = sizeof(*sin6);
2274 sin6->sin6_port = port;
2275 sin6->sin6_addr = *ip6;
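	/*
	 * If the scope is embedded in the address (KAME-style link-local
	 * representation), move it into sin6_scope_id and clear it from
	 * the address bytes handed to user space.
	 */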
2276 if (IN6_IS_SCOPE_EMBED(&sin6->sin6_addr)) {
2277 sin6->sin6_scope_id = ntohs(sin6->sin6_addr.s6_addr16[1]);
2278 sin6->sin6_addr.s6_addr16[1] = 0;
2279 }
2280 }
2281
2282 static void
2283 fill_ip_sockaddr_4_6(union sockaddr_in_4_6 *sin46,
2284 struct in_addr ip, u_int16_t port)
2285 {
2286 struct sockaddr_in *sin = &sin46->sin;
2287
2288 sin->sin_family = AF_INET;
2289 sin->sin_len = sizeof(*sin);
2290 sin->sin_port = port;
2291 sin->sin_addr.s_addr = ip.s_addr;
2292 }
2293
2294 static int
2295 cfil_dispatch_data_event(struct socket *so, uint32_t kcunit, int outgoing,
2296 struct mbuf *data, unsigned int copyoffset, unsigned int copylen)
2297 {
2298 errno_t error = 0;
2299 struct mbuf *copy = NULL;
2300 struct mbuf *msg = NULL;
2301 unsigned int one = 1;
2302 struct cfil_msg_data_event *data_req;
2303 size_t hdrsize;
2304 struct inpcb *inp = (struct inpcb *)so->so_pcb;
2305 struct cfil_entry *entry;
2306 struct cfe_buf *entrybuf;
2307 struct content_filter *cfc;
2308
2309 cfil_rw_lock_shared(&cfil_lck_rw);
2310
2311 entry = &so->so_cfil->cfi_entries[kcunit - 1];
2312 if (outgoing)
2313 entrybuf = &entry->cfe_snd;
2314 else
2315 entrybuf = &entry->cfe_rcv;
2316
2317 cfc = entry->cfe_filter;
2318 if (cfc == NULL)
2319 goto done;
2320
2321 CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d",
2322 (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing);
2323
2324 socket_lock_assert_owned(so);
2325
2326 /* Would be wasteful to try */
2327 if (cfc->cf_flags & CFF_FLOW_CONTROLLED) {
2328 error = ENOBUFS;
2329 goto done;
2330 }
2331
2332 /* Make a copy of the data to pass to kernel control socket */
2333 copy = m_copym_mode(data, copyoffset, copylen, M_DONTWAIT,
2334 M_COPYM_NOOP_HDR);
2335 if (copy == NULL) {
2336 CFIL_LOG(LOG_ERR, "m_copym_mode() failed");
2337 error = ENOMEM;
2338 goto done;
2339 }
2340
2341 /* We need an mbuf packet for the message header */
2342 hdrsize = sizeof(struct cfil_msg_data_event);
2343 error = mbuf_allocpacket(MBUF_DONTWAIT, hdrsize, &one, &msg);
2344 if (error != 0) {
2345 CFIL_LOG(LOG_ERR, "mbuf_allocpacket() failed");
2346 m_freem(copy);
2347 /*
2348 		 * Use ENOMEM: ENOBUFS is reserved to indicate flow control
2349 */
2350 error = ENOMEM;
2351 goto done;
2352 }
2353 mbuf_setlen(msg, hdrsize);
2354 mbuf_pkthdr_setlen(msg, hdrsize + copylen);
2355 msg->m_next = copy;
2356 data_req = (struct cfil_msg_data_event *)mbuf_data(msg);
2357 bzero(data_req, hdrsize);
2358 data_req->cfd_msghdr.cfm_len = hdrsize + copylen;
2359 	data_req->cfd_msghdr.cfm_version = CFM_VERSION_CURRENT;
2360 data_req->cfd_msghdr.cfm_type = CFM_TYPE_EVENT;
2361 data_req->cfd_msghdr.cfm_op =
2362 outgoing ? CFM_OP_DATA_OUT : CFM_OP_DATA_IN;
2363 data_req->cfd_msghdr.cfm_sock_id =
2364 entry->cfe_cfil_info->cfi_sock_id;
2365 data_req->cfd_start_offset = entrybuf->cfe_peeked;
2366 data_req->cfd_end_offset = entrybuf->cfe_peeked + copylen;
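	/*
	 * The start/end offsets are absolute byte offsets in the stream for
	 * this direction: this event covers [cfe_peeked, cfe_peeked + copylen)
	 * so the agent can reply with pass/peek offsets in the same space.
	 */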
2367
2368 /*
2369 * TBD:
2370 	 * For non-connected sockets we need to copy the addresses from the
2371 	 * passed parameters
2372 */
2373 if (inp->inp_vflag & INP_IPV6) {
2374 if (outgoing) {
2375 fill_ip6_sockaddr_4_6(&data_req->cfc_src,
2376 &inp->in6p_laddr, inp->inp_lport);
2377 fill_ip6_sockaddr_4_6(&data_req->cfc_dst,
2378 &inp->in6p_faddr, inp->inp_fport);
2379 } else {
2380 fill_ip6_sockaddr_4_6(&data_req->cfc_src,
2381 &inp->in6p_faddr, inp->inp_fport);
2382 fill_ip6_sockaddr_4_6(&data_req->cfc_dst,
2383 &inp->in6p_laddr, inp->inp_lport);
2384 }
2385 } else if (inp->inp_vflag & INP_IPV4) {
2386 if (outgoing) {
2387 fill_ip_sockaddr_4_6(&data_req->cfc_src,
2388 inp->inp_laddr, inp->inp_lport);
2389 fill_ip_sockaddr_4_6(&data_req->cfc_dst,
2390 inp->inp_faddr, inp->inp_fport);
2391 } else {
2392 fill_ip_sockaddr_4_6(&data_req->cfc_src,
2393 inp->inp_faddr, inp->inp_fport);
2394 fill_ip_sockaddr_4_6(&data_req->cfc_dst,
2395 inp->inp_laddr, inp->inp_lport);
2396 }
2397 }
2398
2399 /* Pass the message to the content filter */
2400 error = ctl_enqueuembuf(entry->cfe_filter->cf_kcref,
2401 entry->cfe_filter->cf_kcunit,
2402 msg, CTL_DATA_EOR);
2403 if (error != 0) {
2404 CFIL_LOG(LOG_ERR, "ctl_enqueuembuf() failed: %d", error);
2405 mbuf_freem(msg);
2406 goto done;
2407 }
2408 entry->cfe_flags &= ~CFEF_FLOW_CONTROLLED;
2409 OSIncrementAtomic(&cfil_stats.cfs_data_event_ok);
2410 done:
2411 if (error == ENOBUFS) {
2412 entry->cfe_flags |= CFEF_FLOW_CONTROLLED;
2413 OSIncrementAtomic(
2414 &cfil_stats.cfs_data_event_flow_control);
2415
2416 if (!cfil_rw_lock_shared_to_exclusive(&cfil_lck_rw))
2417 cfil_rw_lock_exclusive(&cfil_lck_rw);
2418
2419 cfc->cf_flags |= CFF_FLOW_CONTROLLED;
2420
2421 cfil_rw_unlock_exclusive(&cfil_lck_rw);
2422 } else {
2423 if (error != 0)
2424 OSIncrementAtomic(&cfil_stats.cfs_data_event_fail);
2425
2426 cfil_rw_unlock_shared(&cfil_lck_rw);
2427 }
2428 return (error);
2429 }
2430
2431 /*
2432 * Process the queue of data waiting to be delivered to content filter
2433 */
2434 static int
2435 cfil_data_service_ctl_q(struct socket *so, uint32_t kcunit, int outgoing)
2436 {
2437 errno_t error = 0;
2438 struct mbuf *data, *tmp = NULL;
2439 unsigned int datalen = 0, copylen = 0, copyoffset = 0;
2440 struct cfil_entry *entry;
2441 struct cfe_buf *entrybuf;
2442 uint64_t currentoffset = 0;
2443
2444 if (so->so_cfil == NULL)
2445 return (0);
2446
2447 CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d",
2448 (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing);
2449
2450 socket_lock_assert_owned(so);
2451
2452 entry = &so->so_cfil->cfi_entries[kcunit - 1];
2453 if (outgoing)
2454 entrybuf = &entry->cfe_snd;
2455 else
2456 entrybuf = &entry->cfe_rcv;
2457
2458 /* Send attached message if not yet done */
2459 if ((entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED) == 0) {
2460 error = cfil_dispatch_attach_event(so, kcunit);
2461 if (error != 0) {
2462 /* We can recover from flow control */
2463 if (error == ENOBUFS || error == ENOMEM)
2464 error = 0;
2465 goto done;
2466 }
2467 } else if ((entry->cfe_flags & CFEF_DATA_START) == 0) {
2468 OSIncrementAtomic(&cfil_stats.cfs_ctl_q_not_started);
2469 goto done;
2470 }
2471 CFIL_LOG(LOG_DEBUG, "pass_offset %llu peeked %llu peek_offset %llu",
2472 entrybuf->cfe_pass_offset,
2473 entrybuf->cfe_peeked,
2474 entrybuf->cfe_peek_offset);
2475
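	/*
	 * Two passes over the control queue: first move data that is fully
	 * below the pass offset to the pending queue, then send peek events
	 * for the remaining span up to the peek offset.
	 */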
2476 /* Move all data that can pass */
2477 while ((data = cfil_queue_first(&entrybuf->cfe_ctl_q)) != NULL &&
2478 entrybuf->cfe_ctl_q.q_start < entrybuf->cfe_pass_offset) {
2479 datalen = cfil_data_length(data, NULL);
2480 tmp = data;
2481
2482 if (entrybuf->cfe_ctl_q.q_start + datalen <=
2483 entrybuf->cfe_pass_offset) {
2484 /*
2485 * The first mbuf can fully pass
2486 */
2487 copylen = datalen;
2488 } else {
2489 /*
2490 * The first mbuf can partially pass
2491 */
2492 copylen = entrybuf->cfe_pass_offset -
2493 entrybuf->cfe_ctl_q.q_start;
2494 }
2495 VERIFY(copylen <= datalen);
2496
2497 CFIL_LOG(LOG_DEBUG,
2498 "%llx first %llu peeked %llu pass %llu peek %llu"
2499 "datalen %u copylen %u",
2500 (uint64_t)VM_KERNEL_ADDRPERM(tmp),
2501 entrybuf->cfe_ctl_q.q_start,
2502 entrybuf->cfe_peeked,
2503 entrybuf->cfe_pass_offset,
2504 entrybuf->cfe_peek_offset,
2505 datalen, copylen);
2506
2507 /*
2508 * Data that passes has been peeked at explicitly or
2509 * implicitly
2510 */
2511 if (entrybuf->cfe_ctl_q.q_start + copylen >
2512 entrybuf->cfe_peeked)
2513 entrybuf->cfe_peeked =
2514 entrybuf->cfe_ctl_q.q_start + copylen;
2515 /*
2516 * Stop on partial pass
2517 */
2518 if (copylen < datalen)
2519 break;
2520
2521 /* All good, move full data from ctl queue to pending queue */
2522 cfil_queue_remove(&entrybuf->cfe_ctl_q, data, datalen);
2523
2524 cfil_queue_enqueue(&entrybuf->cfe_pending_q, data, datalen);
2525 if (outgoing)
2526 OSAddAtomic64(datalen,
2527 &cfil_stats.cfs_pending_q_out_enqueued);
2528 else
2529 OSAddAtomic64(datalen,
2530 &cfil_stats.cfs_pending_q_in_enqueued);
2531 }
2532 CFIL_INFO_VERIFY(so->so_cfil);
2533 if (tmp != NULL)
2534 CFIL_LOG(LOG_DEBUG,
2535 "%llx first %llu peeked %llu pass %llu peek %llu"
2536 "datalen %u copylen %u",
2537 (uint64_t)VM_KERNEL_ADDRPERM(tmp),
2538 entrybuf->cfe_ctl_q.q_start,
2539 entrybuf->cfe_peeked,
2540 entrybuf->cfe_pass_offset,
2541 entrybuf->cfe_peek_offset,
2542 datalen, copylen);
2543 tmp = NULL;
2544
2545 /* Now deal with remaining data the filter wants to peek at */
2546 for (data = cfil_queue_first(&entrybuf->cfe_ctl_q),
2547 currentoffset = entrybuf->cfe_ctl_q.q_start;
2548 data != NULL && currentoffset < entrybuf->cfe_peek_offset;
2549 data = cfil_queue_next(&entrybuf->cfe_ctl_q, data),
2550 currentoffset += datalen) {
2551 datalen = cfil_data_length(data, NULL);
2552 tmp = data;
2553
2554 /* We've already peeked at this mbuf */
2555 if (currentoffset + datalen <= entrybuf->cfe_peeked)
2556 continue;
2557 /*
2558 * The data in the first mbuf may have been
2559 * partially peeked at
2560 */
2561 copyoffset = entrybuf->cfe_peeked - currentoffset;
2562 VERIFY(copyoffset < datalen);
2563 copylen = datalen - copyoffset;
2564 VERIFY(copylen <= datalen);
2565 /*
2566 * Do not copy more than needed
2567 */
2568 if (currentoffset + copyoffset + copylen >
2569 entrybuf->cfe_peek_offset) {
2570 copylen = entrybuf->cfe_peek_offset -
2571 (currentoffset + copyoffset);
2572 }
2573
2574 CFIL_LOG(LOG_DEBUG,
2575 "%llx current %llu peeked %llu pass %llu peek %llu"
2576 "datalen %u copylen %u copyoffset %u",
2577 (uint64_t)VM_KERNEL_ADDRPERM(tmp),
2578 currentoffset,
2579 entrybuf->cfe_peeked,
2580 entrybuf->cfe_pass_offset,
2581 entrybuf->cfe_peek_offset,
2582 datalen, copylen, copyoffset);
2583
2584 /*
2585 * Stop if there is nothing more to peek at
2586 */
2587 if (copylen == 0)
2588 break;
2589 /*
2590 * Let the filter get a peek at this span of data
2591 */
2592 error = cfil_dispatch_data_event(so, kcunit,
2593 outgoing, data, copyoffset, copylen);
2594 if (error != 0) {
2595 /* On error, leave data in ctl_q */
2596 break;
2597 }
2598 entrybuf->cfe_peeked += copylen;
2599 if (outgoing)
2600 OSAddAtomic64(copylen,
2601 &cfil_stats.cfs_ctl_q_out_peeked);
2602 else
2603 OSAddAtomic64(copylen,
2604 &cfil_stats.cfs_ctl_q_in_peeked);
2605
2606 /* Stop when data could not be fully peeked at */
2607 if (copylen + copyoffset < datalen)
2608 break;
2609 }
2610 CFIL_INFO_VERIFY(so->so_cfil);
2611 if (tmp != NULL)
2612 CFIL_LOG(LOG_DEBUG,
2613 "%llx first %llu peeked %llu pass %llu peek %llu"
2614 "datalen %u copylen %u copyoffset %u",
2615 (uint64_t)VM_KERNEL_ADDRPERM(tmp),
2616 currentoffset,
2617 entrybuf->cfe_peeked,
2618 entrybuf->cfe_pass_offset,
2619 entrybuf->cfe_peek_offset,
2620 datalen, copylen, copyoffset);
2621
2622 /*
2623 * Process data that has passed the filter
2624 */
2625 error = cfil_service_pending_queue(so, kcunit, outgoing);
2626 if (error != 0) {
2627 CFIL_LOG(LOG_ERR, "cfil_service_pending_queue() error %d",
2628 error);
2629 goto done;
2630 }
2631
2632 /*
2633 * Dispatch disconnect events that could not be sent
2634 */
2635 if (so->so_cfil == NULL)
2636 goto done;
2637 else if (outgoing) {
2638 if ((so->so_cfil->cfi_flags & CFIF_SHUT_WR) &&
2639 !(entry->cfe_flags & CFEF_SENT_DISCONNECT_OUT))
2640 cfil_dispatch_disconnect_event(so, kcunit, 1);
2641 } else {
2642 if ((so->so_cfil->cfi_flags & CFIF_SHUT_RD) &&
2643 !(entry->cfe_flags & CFEF_SENT_DISCONNECT_IN))
2644 cfil_dispatch_disconnect_event(so, kcunit, 0);
2645 }
2646
2647 done:
2648 CFIL_LOG(LOG_DEBUG,
2649 "first %llu peeked %llu pass %llu peek %llu",
2650 entrybuf->cfe_ctl_q.q_start,
2651 entrybuf->cfe_peeked,
2652 entrybuf->cfe_pass_offset,
2653 entrybuf->cfe_peek_offset);
2654
2655 CFIL_INFO_VERIFY(so->so_cfil);
2656 return (error);
2657 }
2658
2659 /*
2660 * cfil_data_filter()
2661 *
2662 * Process data for a content filter installed on a socket
2663 */
2664 int
2665 cfil_data_filter(struct socket *so, uint32_t kcunit, int outgoing,
2666 struct mbuf *data, uint64_t datalen)
2667 {
2668 errno_t error = 0;
2669 struct cfil_entry *entry;
2670 struct cfe_buf *entrybuf;
2671
2672 CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d",
2673 (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing);
2674
2675 socket_lock_assert_owned(so);
2676
2677 entry = &so->so_cfil->cfi_entries[kcunit - 1];
2678 if (outgoing)
2679 entrybuf = &entry->cfe_snd;
2680 else
2681 entrybuf = &entry->cfe_rcv;
2682
2683 /* Are we attached to the filter? */
2684 if (entry->cfe_filter == NULL) {
2685 error = 0;
2686 goto done;
2687 }
2688
2689 /* Dispatch to filters */
2690 cfil_queue_enqueue(&entrybuf->cfe_ctl_q, data, datalen);
2691 if (outgoing)
2692 OSAddAtomic64(datalen,
2693 &cfil_stats.cfs_ctl_q_out_enqueued);
2694 else
2695 OSAddAtomic64(datalen,
2696 &cfil_stats.cfs_ctl_q_in_enqueued);
2697
2698 error = cfil_data_service_ctl_q(so, kcunit, outgoing);
2699 if (error != 0) {
2700 CFIL_LOG(LOG_ERR, "cfil_data_service_ctl_q() error %d",
2701 error);
2702 }
2703 /*
2704 	 * We have to return EJUSTRETURN in all cases to avoid a double free
2705 	 * by the socket layer
2706 */
2707 error = EJUSTRETURN;
2708 done:
2709 CFIL_INFO_VERIFY(so->so_cfil);
2710
2711 CFIL_LOG(LOG_INFO, "return %d", error);
2712 return (error);
2713 }
2714
2715 /*
2716 * cfil_service_inject_queue() re-inject data that passed the
2717 * content filters
2718 */
2719 static int
2720 cfil_service_inject_queue(struct socket *so, int outgoing)
2721 {
2722 mbuf_t data;
2723 unsigned int datalen;
2724 int mbcnt;
2725 unsigned int copylen;
2726 errno_t error = 0;
2727 struct mbuf *copy = NULL;
2728 struct cfi_buf *cfi_buf;
2729 struct cfil_queue *inject_q;
2730 int need_rwakeup = 0;
2731
2732 if (so->so_cfil == NULL)
2733 return (0);
2734
2735 CFIL_LOG(LOG_INFO, "so %llx outgoing %d",
2736 (uint64_t)VM_KERNEL_ADDRPERM(so), outgoing);
2737
2738 socket_lock_assert_owned(so);
2739
2740 if (outgoing) {
2741 cfi_buf = &so->so_cfil->cfi_snd;
2742 so->so_cfil->cfi_flags &= ~CFIF_RETRY_INJECT_OUT;
2743 } else {
2744 cfi_buf = &so->so_cfil->cfi_rcv;
2745 so->so_cfil->cfi_flags &= ~CFIF_RETRY_INJECT_IN;
2746 }
2747 inject_q = &cfi_buf->cfi_inject_q;
2748
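	/*
	 * Re-inject each mbuf chain held in the inject queue: a copy is sent
	 * with the content filter bypassed so it is not filtered again, and
	 * the original is removed from the queue only when injection succeeds.
	 */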
2749 while ((data = cfil_queue_first(inject_q)) != NULL) {
2750 datalen = cfil_data_length(data, &mbcnt);
2751
2752 CFIL_LOG(LOG_INFO, "data %llx datalen %u",
2753 (uint64_t)VM_KERNEL_ADDRPERM(data), datalen);
2754
2755 /* Make a copy in case of injection error */
2756 copy = m_copym_mode(data, 0, M_COPYALL, M_DONTWAIT,
2757 M_COPYM_COPY_HDR);
2758 if (copy == NULL) {
2759 CFIL_LOG(LOG_ERR, "m_copym_mode() failed");
2760 error = ENOMEM;
2761 break;
2762 }
2763
2764 if ((copylen = m_length(copy)) != datalen)
2765 panic("%s so %p copylen %d != datalen %d",
2766 __func__, so, copylen, datalen);
2767
2768 if (outgoing) {
2769 socket_unlock(so, 0);
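			/*
			 * Drop the socket lock across sosend(); the cfil state
			 * is re-checked once the lock is re-acquired below.
			 */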
2770
2771 /*
2772 			 * Set both DONTWAIT and NBIO flags as we really
2773 * do not want to block
2774 */
2775 error = sosend(so, NULL, NULL,
2776 copy, NULL,
2777 MSG_SKIPCFIL | MSG_DONTWAIT | MSG_NBIO);
2778
2779 socket_lock(so, 0);
2780
2781 if (error != 0) {
2782 CFIL_LOG(LOG_ERR, "sosend() failed %d",
2783 error);
2784 }
2785 } else {
2786 copy->m_flags |= M_SKIPCFIL;
2787
2788 /*
2789 * NOTE:
2790 			 * This works only because we support plain TCP.
2791 			 * For UDP, RAWIP, MPTCP and message TCP we'll
2792 			 * need to call the appropriate sbappendxxx()
2793 			 * or fix sock_inject_data_in()
2794 */
2795 if (sbappendstream(&so->so_rcv, copy))
2796 need_rwakeup = 1;
2797 }
2798
2799 /* Need to reassess if filter is still attached after unlock */
2800 if (so->so_cfil == NULL) {
2801 CFIL_LOG(LOG_ERR, "so %llx cfil detached",
2802 (uint64_t)VM_KERNEL_ADDRPERM(so));
2803 OSIncrementAtomic(&cfil_stats.cfs_inject_q_detached);
2804 error = 0;
2805 break;
2806 }
2807 if (error != 0)
2808 break;
2809
2810 /* Injection successful */
2811 cfil_queue_remove(inject_q, data, datalen);
2812 mbuf_freem(data);
2813
2814 cfi_buf->cfi_pending_first += datalen;
2815 cfi_buf->cfi_pending_mbcnt -= mbcnt;
2816 cfil_info_buf_verify(cfi_buf);
2817
2818 if (outgoing)
2819 OSAddAtomic64(datalen,
2820 &cfil_stats.cfs_inject_q_out_passed);
2821 else
2822 OSAddAtomic64(datalen,
2823 &cfil_stats.cfs_inject_q_in_passed);
2824 }
2825
2826 	/* A single wakeup for several packets is more efficient */
2827 if (need_rwakeup)
2828 sorwakeup(so);
2829
2830 if (error != 0 && so->so_cfil) {
2831 if (error == ENOBUFS)
2832 OSIncrementAtomic(&cfil_stats.cfs_inject_q_nobufs);
2833 if (error == ENOMEM)
2834 OSIncrementAtomic(&cfil_stats.cfs_inject_q_nomem);
2835
2836 if (outgoing) {
2837 so->so_cfil->cfi_flags |= CFIF_RETRY_INJECT_OUT;
2838 OSIncrementAtomic(&cfil_stats.cfs_inject_q_out_fail);
2839 } else {
2840 so->so_cfil->cfi_flags |= CFIF_RETRY_INJECT_IN;
2841 OSIncrementAtomic(&cfil_stats.cfs_inject_q_in_fail);
2842 }
2843 }
2844
2845 /*
2846 	 * Notify: complete any deferred write-side shutdown and wake up close waiters
2847 */
2848 if (so->so_cfil && (so->so_cfil->cfi_flags & CFIF_SHUT_WR)) {
2849 cfil_sock_notify_shutdown(so, SHUT_WR);
2850 if (cfil_sock_data_pending(&so->so_snd) == 0)
2851 soshutdownlock_final(so, SHUT_WR);
2852 }
2853 if (so->so_cfil && (so->so_cfil->cfi_flags & CFIF_CLOSE_WAIT)) {
2854 if (cfil_filters_attached(so) == 0) {
2855 CFIL_LOG(LOG_INFO, "so %llx waking",
2856 (uint64_t)VM_KERNEL_ADDRPERM(so));
2857 wakeup((caddr_t)&so->so_cfil);
2858 }
2859 }
2860
2861 CFIL_INFO_VERIFY(so->so_cfil);
2862
2863 return (error);
2864 }
2865
2866 static int
2867 cfil_service_pending_queue(struct socket *so, uint32_t kcunit, int outgoing)
2868 {
2869 uint64_t passlen, curlen;
2870 mbuf_t data;
2871 unsigned int datalen;
2872 errno_t error = 0;
2873 struct cfil_entry *entry;
2874 struct cfe_buf *entrybuf;
2875 struct cfil_queue *pending_q;
2876
2877 CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d",
2878 (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing);
2879
2880 socket_lock_assert_owned(so);
2881
2882 entry = &so->so_cfil->cfi_entries[kcunit - 1];
2883 if (outgoing)
2884 entrybuf = &entry->cfe_snd;
2885 else
2886 entrybuf = &entry->cfe_rcv;
2887
2888 pending_q = &entrybuf->cfe_pending_q;
2889
2890 passlen = entrybuf->cfe_pass_offset - pending_q->q_start;
2891
2892 /*
2893 * Locate the chunks of data that we can pass to the next filter
2894 * A data chunk must be on mbuf boundaries
2895 */
2896 curlen = 0;
2897 while ((data = cfil_queue_first(pending_q)) != NULL) {
2898 datalen = cfil_data_length(data, NULL);
2899
2900 CFIL_LOG(LOG_INFO,
2901 "data %llx datalen %u passlen %llu curlen %llu",
2902 (uint64_t)VM_KERNEL_ADDRPERM(data), datalen,
2903 passlen, curlen);
2904
2905 if (curlen + datalen > passlen)
2906 break;
2907
2908 cfil_queue_remove(pending_q, data, datalen);
2909
2910 curlen += datalen;
2911
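		/*
		 * Hand the data that this filter has passed to the remaining
		 * filters (higher kcunits) before queuing it for re-injection.
		 */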
2912 for (kcunit += 1;
2913 kcunit <= MAX_CONTENT_FILTER;
2914 kcunit++) {
2915 error = cfil_data_filter(so, kcunit, outgoing,
2916 data, datalen);
2917 /* 0 means passed so we can continue */
2918 if (error != 0)
2919 break;
2920 }
2921 /* When data has passed all filters, re-inject */
2922 if (error == 0) {
2923 if (outgoing) {
2924 cfil_queue_enqueue(
2925 &so->so_cfil->cfi_snd.cfi_inject_q,
2926 data, datalen);
2927 OSAddAtomic64(datalen,
2928 &cfil_stats.cfs_inject_q_out_enqueued);
2929 } else {
2930 cfil_queue_enqueue(
2931 &so->so_cfil->cfi_rcv.cfi_inject_q,
2932 data, datalen);
2933 OSAddAtomic64(datalen,
2934 &cfil_stats.cfs_inject_q_in_enqueued);
2935 }
2936 }
2937 }
2938
2939 CFIL_INFO_VERIFY(so->so_cfil);
2940
2941 return (error);
2942 }
2943
2944 int
2945 cfil_update_data_offsets(struct socket *so, uint32_t kcunit, int outgoing,
2946 uint64_t pass_offset, uint64_t peek_offset)
2947 {
2948 errno_t error = 0;
2949 struct cfil_entry *entry = NULL;
2950 struct cfe_buf *entrybuf;
2951 int updated = 0;
2952
2953 CFIL_LOG(LOG_INFO, "pass %llu peek %llu", pass_offset, peek_offset);
2954
2955 socket_lock_assert_owned(so);
2956
2957 if (so->so_cfil == NULL) {
2958 CFIL_LOG(LOG_ERR, "so %llx cfil detached",
2959 (uint64_t)VM_KERNEL_ADDRPERM(so));
2960 error = 0;
2961 goto done;
2962 } else if (so->so_cfil->cfi_flags & CFIF_DROP) {
2963 CFIL_LOG(LOG_ERR, "so %llx drop set",
2964 (uint64_t)VM_KERNEL_ADDRPERM(so));
2965 error = EPIPE;
2966 goto done;
2967 }
2968
2969 entry = &so->so_cfil->cfi_entries[kcunit - 1];
2970 if (outgoing)
2971 entrybuf = &entry->cfe_snd;
2972 else
2973 entrybuf = &entry->cfe_rcv;
2974
2975 /* Record updated offsets for this content filter */
2976 if (pass_offset > entrybuf->cfe_pass_offset) {
2977 entrybuf->cfe_pass_offset = pass_offset;
2978
2979 if (entrybuf->cfe_peek_offset < entrybuf->cfe_pass_offset)
2980 entrybuf->cfe_peek_offset = entrybuf->cfe_pass_offset;
2981 updated = 1;
2982 } else {
2983 CFIL_LOG(LOG_INFO, "pass_offset %llu <= cfe_pass_offset %llu",
2984 pass_offset, entrybuf->cfe_pass_offset);
2985 }
2986 /* Filter does not want or need to see data that's allowed to pass */
2987 if (peek_offset > entrybuf->cfe_pass_offset &&
2988 peek_offset > entrybuf->cfe_peek_offset) {
2989 entrybuf->cfe_peek_offset = peek_offset;
2990 updated = 1;
2991 }
2992 /* Nothing to do */
2993 if (updated == 0)
2994 goto done;
2995
2996 /* Move data held in control queue to pending queue if needed */
2997 error = cfil_data_service_ctl_q(so, kcunit, outgoing);
2998 if (error != 0) {
2999 CFIL_LOG(LOG_ERR, "cfil_data_service_ctl_q() error %d",
3000 error);
3001 goto done;
3002 }
3003 error = EJUSTRETURN;
3004
3005 done:
3006 /*
3007 	 * The filter is effectively detached when it has passed all data
3008 	 * from both sides, or when the socket is closed and no more data
3009 	 * is waiting to be delivered to the filter
3010 */
3011 if (entry != NULL &&
3012 ((entry->cfe_snd.cfe_pass_offset == CFM_MAX_OFFSET &&
3013 entry->cfe_rcv.cfe_pass_offset == CFM_MAX_OFFSET) ||
3014 ((so->so_cfil->cfi_flags & CFIF_CLOSE_WAIT) &&
3015 cfil_queue_empty(&entry->cfe_snd.cfe_ctl_q) &&
3016 cfil_queue_empty(&entry->cfe_rcv.cfe_ctl_q)))) {
3017 entry->cfe_flags |= CFEF_CFIL_DETACHED;
3018 CFIL_LOG(LOG_INFO, "so %llx detached %u",
3019 (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit);
3020 if ((so->so_cfil->cfi_flags & CFIF_CLOSE_WAIT) &&
3021 cfil_filters_attached(so) == 0) {
3022 CFIL_LOG(LOG_INFO, "so %llx waking",
3023 (uint64_t)VM_KERNEL_ADDRPERM(so));
3024 wakeup((caddr_t)&so->so_cfil);
3025 }
3026 }
3027 CFIL_INFO_VERIFY(so->so_cfil);
3028 CFIL_LOG(LOG_INFO, "return %d", error);
3029 return (error);
3030 }
3031
3032 /*
3033 * Update pass offset for socket when no data is pending
3034 */
3035 static int
3036 cfil_set_socket_pass_offset(struct socket *so, int outgoing)
3037 {
3038 struct cfi_buf *cfi_buf;
3039 struct cfil_entry *entry;
3040 struct cfe_buf *entrybuf;
3041 uint32_t kcunit;
3042 uint64_t pass_offset = 0;
3043
3044 if (so->so_cfil == NULL)
3045 return (0);
3046
3047 CFIL_LOG(LOG_INFO, "so %llx outgoing %d",
3048 (uint64_t)VM_KERNEL_ADDRPERM(so), outgoing);
3049
3050 socket_lock_assert_owned(so);
3051
3052 if (outgoing)
3053 cfi_buf = &so->so_cfil->cfi_snd;
3054 else
3055 cfi_buf = &so->so_cfil->cfi_rcv;
3056
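	/*
	 * Only recompute when no data is pending: the socket-wide pass
	 * offset is the smallest pass offset across the attached filters,
	 * since data may only pass once every filter has cleared it.
	 */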
3057 if (cfi_buf->cfi_pending_last - cfi_buf->cfi_pending_first == 0) {
3058 for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
3059 entry = &so->so_cfil->cfi_entries[kcunit - 1];
3060
3061 /* Are we attached to a filter? */
3062 if (entry->cfe_filter == NULL)
3063 continue;
3064
3065 if (outgoing)
3066 entrybuf = &entry->cfe_snd;
3067 else
3068 entrybuf = &entry->cfe_rcv;
3069
3070 if (pass_offset == 0 ||
3071 entrybuf->cfe_pass_offset < pass_offset)
3072 pass_offset = entrybuf->cfe_pass_offset;
3073 }
3074 cfi_buf->cfi_pass_offset = pass_offset;
3075 }
3076
3077 return (0);
3078 }
3079
3080 int
3081 cfil_action_data_pass(struct socket *so, uint32_t kcunit, int outgoing,
3082 uint64_t pass_offset, uint64_t peek_offset)
3083 {
3084 errno_t error = 0;
3085
3086 CFIL_LOG(LOG_INFO, "");
3087
3088 socket_lock_assert_owned(so);
3089
3090 error = cfil_acquire_sockbuf(so, outgoing);
3091 if (error != 0) {
3092 CFIL_LOG(LOG_INFO, "so %llx %s dropped",
3093 (uint64_t)VM_KERNEL_ADDRPERM(so),
3094 outgoing ? "out" : "in");
3095 goto release;
3096 }
3097
3098 error = cfil_update_data_offsets(so, kcunit, outgoing,
3099 pass_offset, peek_offset);
3100
3101 cfil_service_inject_queue(so, outgoing);
3102
3103 cfil_set_socket_pass_offset(so, outgoing);
3104 release:
3105 CFIL_INFO_VERIFY(so->so_cfil);
3106 cfil_release_sockbuf(so, outgoing);
3107
3108 return (error);
3109 }
3110
3111
3112 static void
3113 cfil_flush_queues(struct socket *so)
3114 {
3115 struct cfil_entry *entry;
3116 int kcunit;
3117 uint64_t drained;
3118
3119 if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
3120 goto done;
3121
3122 socket_lock_assert_owned(so);
3123
3124 /*
3125 * Flush the output queues and ignore errors as long as
3126 * we are attached
3127 */
3128 (void) cfil_acquire_sockbuf(so, 1);
3129 if (so->so_cfil != NULL) {
3130 drained = 0;
3131 for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
3132 entry = &so->so_cfil->cfi_entries[kcunit - 1];
3133
3134 drained += cfil_queue_drain(&entry->cfe_snd.cfe_ctl_q);
3135 drained += cfil_queue_drain(
3136 &entry->cfe_snd.cfe_pending_q);
3137 }
3138 drained += cfil_queue_drain(&so->so_cfil->cfi_snd.cfi_inject_q);
3139 if (drained) {
3140 if (so->so_cfil->cfi_flags & CFIF_DROP)
3141 OSIncrementAtomic(
3142 &cfil_stats.cfs_flush_out_drop);
3143 else
3144 OSIncrementAtomic(
3145 &cfil_stats.cfs_flush_out_close);
3146 }
3147 }
3148 cfil_release_sockbuf(so, 1);
3149
3150 /*
3151 * Flush the input queues
3152 */
3153 (void) cfil_acquire_sockbuf(so, 0);
3154 if (so->so_cfil != NULL) {
3155 drained = 0;
3156 for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
3157 entry = &so->so_cfil->cfi_entries[kcunit - 1];
3158
3159 drained += cfil_queue_drain(
3160 &entry->cfe_rcv.cfe_ctl_q);
3161 drained += cfil_queue_drain(
3162 &entry->cfe_rcv.cfe_pending_q);
3163 }
3164 drained += cfil_queue_drain(&so->so_cfil->cfi_rcv.cfi_inject_q);
3165 if (drained) {
3166 if (so->so_cfil->cfi_flags & CFIF_DROP)
3167 OSIncrementAtomic(
3168 &cfil_stats.cfs_flush_in_drop);
3169 else
3170 OSIncrementAtomic(
3171 &cfil_stats.cfs_flush_in_close);
3172 }
3173 }
3174 cfil_release_sockbuf(so, 0);
3175 done:
3176 CFIL_INFO_VERIFY(so->so_cfil);
3177 }
3178
3179 int
3180 cfil_action_drop(struct socket *so, uint32_t kcunit)
3181 {
3182 errno_t error = 0;
3183 struct cfil_entry *entry;
3184 struct proc *p;
3185
3186 if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
3187 goto done;
3188
3189 socket_lock_assert_owned(so);
3190
3191 entry = &so->so_cfil->cfi_entries[kcunit - 1];
3192
3193 /* Are we attached to the filter? */
3194 if (entry->cfe_filter == NULL)
3195 goto done;
3196
3197 so->so_cfil->cfi_flags |= CFIF_DROP;
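	/*
	 * Once CFIF_DROP is set, cfil_sock_data_out()/cfil_sock_data_in()
	 * return EPIPE so no further data flows through this socket.
	 */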
3198
3199 p = current_proc();
3200
3201 /*
3202 * Force the socket to be marked defunct
3203 * (forcing fixed along with rdar://19391339)
3204 */
3205 error = sosetdefunct(p, so,
3206 SHUTDOWN_SOCKET_LEVEL_CONTENT_FILTER | SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL,
3207 FALSE);
3208
3209 /* Flush the socket buffer and disconnect */
3210 if (error == 0)
3211 error = sodefunct(p, so,
3212 SHUTDOWN_SOCKET_LEVEL_CONTENT_FILTER | SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
3213
3214 /* The filter is done, mark as detached */
3215 entry->cfe_flags |= CFEF_CFIL_DETACHED;
3216 CFIL_LOG(LOG_INFO, "so %llx detached %u",
3217 (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit);
3218
3219 /* Pending data needs to go */
3220 cfil_flush_queues(so);
3221
3222 if (so->so_cfil && (so->so_cfil->cfi_flags & CFIF_CLOSE_WAIT)) {
3223 if (cfil_filters_attached(so) == 0) {
3224 CFIL_LOG(LOG_INFO, "so %llx waking",
3225 (uint64_t)VM_KERNEL_ADDRPERM(so));
3226 wakeup((caddr_t)&so->so_cfil);
3227 }
3228 }
3229 done:
3230 return (error);
3231 }
3232
3233 static int
3234 cfil_update_entry_offsets(struct socket *so, int outgoing, unsigned int datalen)
3235 {
3236 struct cfil_entry *entry;
3237 struct cfe_buf *entrybuf;
3238 uint32_t kcunit;
3239
3240 CFIL_LOG(LOG_INFO, "so %llx outgoing %d datalen %u",
3241 (uint64_t)VM_KERNEL_ADDRPERM(so), outgoing, datalen);
3242
3243 for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
3244 entry = &so->so_cfil->cfi_entries[kcunit - 1];
3245
3246 /* Are we attached to the filter? */
3247 if (entry->cfe_filter == NULL)
3248 continue;
3249
3250 if (outgoing)
3251 entrybuf = &entry->cfe_snd;
3252 else
3253 entrybuf = &entry->cfe_rcv;
3254
3255 entrybuf->cfe_ctl_q.q_start += datalen;
3256 entrybuf->cfe_pass_offset = entrybuf->cfe_ctl_q.q_start;
3257 entrybuf->cfe_peeked = entrybuf->cfe_ctl_q.q_start;
3258 if (entrybuf->cfe_peek_offset < entrybuf->cfe_pass_offset)
3259 entrybuf->cfe_peek_offset = entrybuf->cfe_pass_offset;
3260
3261 entrybuf->cfe_ctl_q.q_end += datalen;
3262
3263 entrybuf->cfe_pending_q.q_start += datalen;
3264 entrybuf->cfe_pending_q.q_end += datalen;
3265 }
3266 CFIL_INFO_VERIFY(so->so_cfil);
3267 return (0);
3268 }
3269
3270 int
3271 cfil_data_common(struct socket *so, int outgoing, struct sockaddr *to,
3272 struct mbuf *data, struct mbuf *control, uint32_t flags)
3273 {
3274 #pragma unused(to, control, flags)
3275 errno_t error = 0;
3276 unsigned int datalen;
3277 int mbcnt;
3278 int kcunit;
3279 struct cfi_buf *cfi_buf;
3280
3281 if (so->so_cfil == NULL) {
3282 CFIL_LOG(LOG_ERR, "so %llx cfil detached",
3283 (uint64_t)VM_KERNEL_ADDRPERM(so));
3284 error = 0;
3285 goto done;
3286 } else if (so->so_cfil->cfi_flags & CFIF_DROP) {
3287 CFIL_LOG(LOG_ERR, "so %llx drop set",
3288 (uint64_t)VM_KERNEL_ADDRPERM(so));
3289 error = EPIPE;
3290 goto done;
3291 }
3292
3293 datalen = cfil_data_length(data, &mbcnt);
3294
3295 CFIL_LOG(LOG_INFO, "so %llx %s m %llx len %u flags 0x%x nextpkt %llx",
3296 (uint64_t)VM_KERNEL_ADDRPERM(so),
3297 outgoing ? "out" : "in",
3298 (uint64_t)VM_KERNEL_ADDRPERM(data), datalen, data->m_flags,
3299 (uint64_t)VM_KERNEL_ADDRPERM(data->m_nextpkt));
3300
3301 if (outgoing)
3302 cfi_buf = &so->so_cfil->cfi_snd;
3303 else
3304 cfi_buf = &so->so_cfil->cfi_rcv;
3305
3306 cfi_buf->cfi_pending_last += datalen;
3307 cfi_buf->cfi_pending_mbcnt += mbcnt;
3308 cfil_info_buf_verify(cfi_buf);
3309
3310 CFIL_LOG(LOG_INFO, "so %llx cfi_pending_last %llu cfi_pass_offset %llu",
3311 (uint64_t)VM_KERNEL_ADDRPERM(so),
3312 cfi_buf->cfi_pending_last,
3313 cfi_buf->cfi_pass_offset);
3314
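	/*
	 * cfi_pending_last is the total number of bytes seen in this
	 * direction; when it is still at or below the socket pass offset
	 * every attached filter has already cleared this range, so only
	 * the per-entry queue offsets need to be advanced.
	 */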
3315 /* Fast path when below pass offset */
3316 if (cfi_buf->cfi_pending_last <= cfi_buf->cfi_pass_offset) {
3317 cfil_update_entry_offsets(so, outgoing, datalen);
3318 } else {
3319 for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
3320 error = cfil_data_filter(so, kcunit, outgoing, data,
3321 datalen);
3322 /* 0 means passed so continue with next filter */
3323 if (error != 0)
3324 break;
3325 }
3326 }
3327
3328 /* Move cursor if no filter claimed the data */
3329 if (error == 0) {
3330 cfi_buf->cfi_pending_first += datalen;
3331 cfi_buf->cfi_pending_mbcnt -= mbcnt;
3332 cfil_info_buf_verify(cfi_buf);
3333 }
3334 done:
3335 CFIL_INFO_VERIFY(so->so_cfil);
3336
3337 return (error);
3338 }
3339
3340 /*
3341 * Callback from socket layer sosendxxx()
3342 */
3343 int
3344 cfil_sock_data_out(struct socket *so, struct sockaddr *to,
3345 struct mbuf *data, struct mbuf *control, uint32_t flags)
3346 {
3347 int error = 0;
3348
3349 if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
3350 return (0);
3351
3352 socket_lock_assert_owned(so);
3353
3354 if (so->so_cfil->cfi_flags & CFIF_DROP) {
3355 CFIL_LOG(LOG_ERR, "so %llx drop set",
3356 (uint64_t)VM_KERNEL_ADDRPERM(so));
3357 return (EPIPE);
3358 }
3359 if (control != NULL) {
3360 CFIL_LOG(LOG_ERR, "so %llx control",
3361 (uint64_t)VM_KERNEL_ADDRPERM(so));
3362 OSIncrementAtomic(&cfil_stats.cfs_data_out_control);
3363 }
3364 if ((flags & MSG_OOB)) {
3365 CFIL_LOG(LOG_ERR, "so %llx MSG_OOB",
3366 (uint64_t)VM_KERNEL_ADDRPERM(so));
3367 OSIncrementAtomic(&cfil_stats.cfs_data_out_oob);
3368 }
3369 if ((so->so_snd.sb_flags & SB_LOCK) == 0)
3370 panic("so %p SB_LOCK not set", so);
3371
3372 if (so->so_snd.sb_cfil_thread != NULL)
3373 panic("%s sb_cfil_thread %p not NULL", __func__,
3374 so->so_snd.sb_cfil_thread);
3375
3376 error = cfil_data_common(so, 1, to, data, control, flags);
3377
3378 return (error);
3379 }
3380
3381 /*
3382 * Callback from socket layer sbappendxxx()
3383 */
3384 int
3385 cfil_sock_data_in(struct socket *so, struct sockaddr *from,
3386 struct mbuf *data, struct mbuf *control, uint32_t flags)
3387 {
3388 int error = 0;
3389
3390 if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
3391 return (0);
3392
3393 socket_lock_assert_owned(so);
3394
3395 if (so->so_cfil->cfi_flags & CFIF_DROP) {
3396 CFIL_LOG(LOG_ERR, "so %llx drop set",
3397 (uint64_t)VM_KERNEL_ADDRPERM(so));
3398 return (EPIPE);
3399 }
3400 if (control != NULL) {
3401 CFIL_LOG(LOG_ERR, "so %llx control",
3402 (uint64_t)VM_KERNEL_ADDRPERM(so));
3403 OSIncrementAtomic(&cfil_stats.cfs_data_in_control);
3404 }
3405 if (data->m_type == MT_OOBDATA) {
3406 CFIL_LOG(LOG_ERR, "so %llx MSG_OOB",
3407 (uint64_t)VM_KERNEL_ADDRPERM(so));
3408 OSIncrementAtomic(&cfil_stats.cfs_data_in_oob);
3409 }
3410 error = cfil_data_common(so, 0, from, data, control, flags);
3411
3412 return (error);
3413 }
3414
3415 /*
3416 * Callback from socket layer soshutdownxxx()
3417 *
3418 	 * We may delay the write-side shutdown while outgoing data is still being processed.
3419 	 *
3420 	 * There is no point in delaying the read shutdown because the process
3421 	 * indicated that it does not want to read any more data.
3422 */
3423 int
3424 cfil_sock_shutdown(struct socket *so, int *how)
3425 {
3426 int error = 0;
3427
3428 if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
3429 goto done;
3430
3431 socket_lock_assert_owned(so);
3432
3433 CFIL_LOG(LOG_INFO, "so %llx how %d",
3434 (uint64_t)VM_KERNEL_ADDRPERM(so), *how);
3435
3436 /*
3437 * Check the state of the socket before the content filter
3438 */
3439 if (*how != SHUT_WR && (so->so_state & SS_CANTRCVMORE) != 0) {
3440 /* read already shut down */
3441 error = ENOTCONN;
3442 goto done;
3443 }
3444 if (*how != SHUT_RD && (so->so_state & SS_CANTSENDMORE) != 0) {
3445 /* write already shut down */
3446 error = ENOTCONN;
3447 goto done;
3448 }
3449
3450 if ((so->so_cfil->cfi_flags & CFIF_DROP) != 0) {
3451 CFIL_LOG(LOG_ERR, "so %llx drop set",
3452 (uint64_t)VM_KERNEL_ADDRPERM(so));
3453 goto done;
3454 }
3455
3456 /*
3457 * shutdown read: SHUT_RD or SHUT_RDWR
3458 */
3459 if (*how != SHUT_WR) {
3460 if (so->so_cfil->cfi_flags & CFIF_SHUT_RD) {
3461 error = ENOTCONN;
3462 goto done;
3463 }
3464 so->so_cfil->cfi_flags |= CFIF_SHUT_RD;
3465 cfil_sock_notify_shutdown(so, SHUT_RD);
3466 }
3467 /*
3468 * shutdown write: SHUT_WR or SHUT_RDWR
3469 */
3470 if (*how != SHUT_RD) {
3471 if (so->so_cfil->cfi_flags & CFIF_SHUT_WR) {
3472 error = ENOTCONN;
3473 goto done;
3474 }
3475 so->so_cfil->cfi_flags |= CFIF_SHUT_WR;
3476 cfil_sock_notify_shutdown(so, SHUT_WR);
3477 /*
3478 * When outgoing data is pending, we delay the shutdown at the
3479 * protocol level until the content filters give the final
3480 * verdict on the pending data.
3481 */
3482 if (cfil_sock_data_pending(&so->so_snd) != 0) {
3483 /*
3484 * When shutting down the read and write sides at once
3485 * we can proceed to the final shutdown of the read
3486 * side. Otherwise, we just return.
3487 */
3488 if (*how == SHUT_WR) {
3489 error = EJUSTRETURN;
3490 } else if (*how == SHUT_RDWR) {
3491 *how = SHUT_RD;
3492 }
3493 }
3494 }
3495 done:
3496 return (error);
3497 }
3498
3499 /*
3500 * This is called when the socket is closed and there is no more
3501 * opportunity for filtering
3502 */
3503 void
3504 cfil_sock_is_closed(struct socket *so)
3505 {
3506 errno_t error = 0;
3507 int kcunit;
3508
3509 if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
3510 return;
3511
3512 CFIL_LOG(LOG_INFO, "so %llx", (uint64_t)VM_KERNEL_ADDRPERM(so));
3513
3514 socket_lock_assert_owned(so);
3515
3516 for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
3517 /* Let the filters know of the closing */
3518 error = cfil_dispatch_closed_event(so, kcunit);
3519 }
3520
3521 /* Last chance to push passed data out */
3522 error = cfil_acquire_sockbuf(so, 1);
3523 if (error == 0)
3524 cfil_service_inject_queue(so, 1);
3525 cfil_release_sockbuf(so, 1);
3526
3527 so->so_cfil->cfi_flags |= CFIF_SOCK_CLOSED;
3528
3529 /* Pending data needs to go */
3530 cfil_flush_queues(so);
3531
3532 CFIL_INFO_VERIFY(so->so_cfil);
3533 }
3534
3535 /*
3536 * This is called when the socket is disconnected so let the filters
3537 * know about the disconnection and that no more data will come
3538 *
3539 	 * The how parameter has the same values as soshutdown()
3540 */
3541 void
3542 cfil_sock_notify_shutdown(struct socket *so, int how)
3543 {
3544 errno_t error = 0;
3545 int kcunit;
3546
3547 if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
3548 return;
3549
3550 CFIL_LOG(LOG_INFO, "so %llx how %d",
3551 (uint64_t)VM_KERNEL_ADDRPERM(so), how);
3552
3553 socket_lock_assert_owned(so);
3554
3555 for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
3556 /* Disconnect incoming side */
3557 if (how != SHUT_WR)
3558 error = cfil_dispatch_disconnect_event(so, kcunit, 0);
3559 /* Disconnect outgoing side */
3560 if (how != SHUT_RD)
3561 error = cfil_dispatch_disconnect_event(so, kcunit, 1);
3562 }
3563 }
3564
3565 static int
3566 cfil_filters_attached(struct socket *so)
3567 {
3568 struct cfil_entry *entry;
3569 uint32_t kcunit;
3570 int attached = 0;
3571
3572 if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
3573 return (0);
3574
3575 socket_lock_assert_owned(so);
3576
3577 for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
3578 entry = &so->so_cfil->cfi_entries[kcunit - 1];
3579
3580 /* Are we attached to the filter? */
3581 if (entry->cfe_filter == NULL)
3582 continue;
3583 if ((entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED) == 0)
3584 continue;
3585 if ((entry->cfe_flags & CFEF_CFIL_DETACHED) != 0)
3586 continue;
3587 attached = 1;
3588 break;
3589 }
3590
3591 return (attached);
3592 }
3593
3594 /*
3595 * This is called when the socket is closed and we are waiting for
3596 	 * the filters to give the final pass or drop verdict
3597 */
3598 void
3599 cfil_sock_close_wait(struct socket *so)
3600 {
3601 lck_mtx_t *mutex_held;
3602 struct timespec ts;
3603 int error;
3604
3605 if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
3606 return;
3607
3608 CFIL_LOG(LOG_INFO, "so %llx", (uint64_t)VM_KERNEL_ADDRPERM(so));
3609
3610 if (so->so_proto->pr_getlock != NULL)
3611 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
3612 else
3613 mutex_held = so->so_proto->pr_domain->dom_mtx;
3614 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
3615
3616 while (cfil_filters_attached(so)) {
3617 /*
3618 * Notify the filters we are going away so they can detach
3619 */
3620 cfil_sock_notify_shutdown(so, SHUT_RDWR);
3621
3622 /*
3623 		 * Make sure we still need to wait after the filters are notified
3624 * of the disconnection
3625 */
3626 if (cfil_filters_attached(so) == 0)
3627 break;
3628
3629 CFIL_LOG(LOG_INFO, "so %llx waiting",
3630 (uint64_t)VM_KERNEL_ADDRPERM(so));
3631
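		/*
		 * cfil_close_wait_timeout is in milliseconds: build the
		 * timespec for msleep() from whole seconds plus the
		 * remainder converted to nanoseconds.
		 */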
3632 ts.tv_sec = cfil_close_wait_timeout / 1000;
3633 ts.tv_nsec = (cfil_close_wait_timeout % 1000) *
3634 NSEC_PER_USEC * 1000;
3635
3636 OSIncrementAtomic(&cfil_stats.cfs_close_wait);
3637 so->so_cfil->cfi_flags |= CFIF_CLOSE_WAIT;
3638 error = msleep((caddr_t)&so->so_cfil, mutex_held,
3639 PSOCK | PCATCH, "cfil_sock_close_wait", &ts);
3640 so->so_cfil->cfi_flags &= ~CFIF_CLOSE_WAIT;
3641
3642 CFIL_LOG(LOG_NOTICE, "so %llx timed out %d",
3643 (uint64_t)VM_KERNEL_ADDRPERM(so), (error != 0));
3644
3645 /*
3646 * Force close in case of timeout
3647 */
3648 if (error != 0) {
3649 OSIncrementAtomic(&cfil_stats.cfs_close_wait_timeout);
3650 break;
3651 }
3652 }
3653
3654 }
3655
3656 /*
3657 	 * Return the amount of data held back by the content filter for the given socket buffer
3658 */
3659 int32_t
3660 cfil_sock_data_pending(struct sockbuf *sb)
3661 {
3662 struct socket *so = sb->sb_so;
3663 uint64_t pending = 0;
3664
3665 if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_cfil != NULL) {
3666 struct cfi_buf *cfi_buf;
3667
3668 socket_lock_assert_owned(so);
3669
3670 if ((sb->sb_flags & SB_RECV) == 0)
3671 cfi_buf = &so->so_cfil->cfi_snd;
3672 else
3673 cfi_buf = &so->so_cfil->cfi_rcv;
3674
3675 pending = cfi_buf->cfi_pending_last -
3676 cfi_buf->cfi_pending_first;
3677
3678 /*
3679 * If we are limited by the "chars of mbufs used" roughly
3680 * adjust so we won't overcommit
3681 */
3682 if (pending > (uint64_t)cfi_buf->cfi_pending_mbcnt)
3683 pending = cfi_buf->cfi_pending_mbcnt;
3684 }
3685
3686 VERIFY(pending < INT32_MAX);
3687
3688 return (int32_t)(pending);
3689 }
3690
3691 /*
3692 * Return the socket buffer space used by data being held by content filters
3693 * so processes won't clog the socket buffer
3694 */
3695 int32_t
3696 cfil_sock_data_space(struct sockbuf *sb)
3697 {
3698 struct socket *so = sb->sb_so;
3699 uint64_t pending = 0;
3700
3701 if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_cfil != NULL &&
3702 so->so_snd.sb_cfil_thread != current_thread()) {
3703 struct cfi_buf *cfi_buf;
3704
3705 socket_lock_assert_owned(so);
3706
3707 if ((sb->sb_flags & SB_RECV) == 0)
3708 cfi_buf = &so->so_cfil->cfi_snd;
3709 else
3710 cfi_buf = &so->so_cfil->cfi_rcv;
3711
3712 pending = cfi_buf->cfi_pending_last -
3713 cfi_buf->cfi_pending_first;
3714
3715 /*
3716 * If we are limited by the "chars of mbufs used" roughly
3717 * adjust so we won't overcommit
3718 */
3719 if ((uint64_t)cfi_buf->cfi_pending_mbcnt > pending)
3720 pending = cfi_buf->cfi_pending_mbcnt;
3721 }
3722
3723 VERIFY(pending < INT32_MAX);
3724
3725 return (int32_t)(pending);
3726 }
3727
3728 /*
3729 * A callback from the socket and protocol layer when data becomes
3730 * available in the socket buffer to give a chance for the content filter
3731 * to re-inject data that was held back
3732 */
3733 void
3734 cfil_sock_buf_update(struct sockbuf *sb)
3735 {
3736 int outgoing;
3737 int error;
3738 struct socket *so = sb->sb_so;
3739
3740 if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
3741 return;
3742
3743 if (!cfil_sbtrim)
3744 return;
3745
3746 socket_lock_assert_owned(so);
3747
3748 if ((sb->sb_flags & SB_RECV) == 0) {
3749 if ((so->so_cfil->cfi_flags & CFIF_RETRY_INJECT_OUT) == 0)
3750 return;
3751 outgoing = 1;
3752 OSIncrementAtomic(&cfil_stats.cfs_inject_q_out_retry);
3753 } else {
3754 if ((so->so_cfil->cfi_flags & CFIF_RETRY_INJECT_IN) == 0)
3755 return;
3756 outgoing = 0;
3757 OSIncrementAtomic(&cfil_stats.cfs_inject_q_in_retry);
3758 }
3759
3760 CFIL_LOG(LOG_NOTICE, "so %llx outgoing %d",
3761 (uint64_t)VM_KERNEL_ADDRPERM(so), outgoing);
3762
3763 error = cfil_acquire_sockbuf(so, outgoing);
3764 if (error == 0)
3765 cfil_service_inject_queue(so, outgoing);
3766 cfil_release_sockbuf(so, outgoing);
3767 }
3768
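/*
 * sysctl handler reporting one cfil_filter_stat per attached content filter.
 * Read-only: a request with a NULL old pointer only reports the size needed,
 * while a normal read copies out the per-filter statistics under the shared
 * cfil_lck_rw lock.
 */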
3769 int
3770 sysctl_cfil_filter_list(struct sysctl_oid *oidp, void *arg1, int arg2,
3771 struct sysctl_req *req)
3772 {
3773 #pragma unused(oidp, arg1, arg2)
3774 int error = 0;
3775 size_t len = 0;
3776 u_int32_t i;
3777
3778 /* Read only */
3779 if (req->newptr != USER_ADDR_NULL)
3780 return (EPERM);
3781
3782 cfil_rw_lock_shared(&cfil_lck_rw);
3783
3784 for (i = 0; content_filters != NULL && i < MAX_CONTENT_FILTER; i++) {
3785 struct cfil_filter_stat filter_stat;
3786 struct content_filter *cfc = content_filters[i];
3787
3788 if (cfc == NULL)
3789 continue;
3790
3791 /* If just asking for the size */
3792 if (req->oldptr == USER_ADDR_NULL) {
3793 len += sizeof(struct cfil_filter_stat);
3794 continue;
3795 }
3796
3797 bzero(&filter_stat, sizeof(struct cfil_filter_stat));
3798 filter_stat.cfs_len = sizeof(struct cfil_filter_stat);
3799 filter_stat.cfs_filter_id = cfc->cf_kcunit;
3800 filter_stat.cfs_flags = cfc->cf_flags;
3801 filter_stat.cfs_sock_count = cfc->cf_sock_count;
3802 filter_stat.cfs_necp_control_unit = cfc->cf_necp_control_unit;
3803
3804 error = SYSCTL_OUT(req, &filter_stat,
3805 sizeof (struct cfil_filter_stat));
3806 if (error != 0)
3807 break;
3808 }
3809 /* If just asking for the size */
3810 if (req->oldptr == USER_ADDR_NULL)
3811 req->oldidx = len;
3812
3813 cfil_rw_unlock_shared(&cfil_lck_rw);
3814
3815 return (error);
3816 }
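/*
 * A minimal user space sketch of reading the filter list through sysctl,
 * assuming the handler above is registered under the name
 * "net.cfil.filter_list" (the usual two-pass pattern: probe for the size,
 * then fetch the records):
 *
 *	#include <sys/sysctl.h>
 *	#include <stdlib.h>
 *
 *	size_t len = 0;
 *	// First pass: NULL old pointer, handler reports the size needed
 *	if (sysctlbyname("net.cfil.filter_list", NULL, &len, NULL, 0) == 0 &&
 *	    len != 0) {
 *		struct cfil_filter_stat *stats = malloc(len);
 *		// Second pass: copy out one cfil_filter_stat per filter
 *		if (stats != NULL &&
 *		    sysctlbyname("net.cfil.filter_list", stats, &len,
 *		    NULL, 0) == 0) {
 *			size_t count = len / sizeof(struct cfil_filter_stat);
 *			// ... inspect stats[0] through stats[count - 1] ...
 *		}
 *		free(stats);
 *	}
 */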
3817
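/*
 * sysctl handler reporting one cfil_sock_stat per socket currently attached
 * to the content filter subsystem.  Read-only: a size-only probe returns an
 * estimate of the space needed, while a normal read walks cfil_sock_head
 * under the shared lock and copies out the per-socket and per-filter-entry
 * statistics.
 */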
3818 static int sysctl_cfil_sock_list(struct sysctl_oid *oidp, void *arg1, int arg2,
3819 struct sysctl_req *req)
3820 {
3821 #pragma unused(oidp, arg1, arg2)
3822 int error = 0;
3823 u_int32_t i;
3824 struct cfil_info *cfi;
3825
3826 /* Read only */
3827 if (req->newptr != USER_ADDR_NULL)
3828 return (EPERM);
3829
3830 cfil_rw_lock_shared(&cfil_lck_rw);
3831
3832 /*
3833 * If just asking for the size, return an estimate
3834 */
3835 if (req->oldptr == USER_ADDR_NULL) {
3836 req->oldidx = cfil_sock_attached_count *
3837 sizeof(struct cfil_sock_stat);
3838 /* Bump the length in case new sockets get attached */
3839 req->oldidx += req->oldidx >> 3;
3840 goto done;
3841 }
3842
3843 TAILQ_FOREACH(cfi, &cfil_sock_head, cfi_link) {
3844 struct cfil_entry *entry;
3845 struct cfil_sock_stat stat;
3846 struct socket *so = cfi->cfi_so;
3847
3848 bzero(&stat, sizeof(struct cfil_sock_stat));
3849 stat.cfs_len = sizeof(struct cfil_sock_stat);
3850 stat.cfs_sock_id = cfi->cfi_sock_id;
3851 stat.cfs_flags = cfi->cfi_flags;
3852
3853 if (so != NULL) {
3854 stat.cfs_pid = so->last_pid;
3855 memcpy(stat.cfs_uuid, so->last_uuid,
3856 sizeof(uuid_t));
3857 if (so->so_flags & SOF_DELEGATED) {
3858 stat.cfs_e_pid = so->e_pid;
3859 memcpy(stat.cfs_e_uuid, so->e_uuid,
3860 sizeof(uuid_t));
3861 } else {
3862 stat.cfs_e_pid = so->last_pid;
3863 memcpy(stat.cfs_e_uuid, so->last_uuid,
3864 sizeof(uuid_t));
3865 }
3866 }
3867
3868 stat.cfs_snd.cbs_pending_first =
3869 cfi->cfi_snd.cfi_pending_first;
3870 stat.cfs_snd.cbs_pending_last =
3871 cfi->cfi_snd.cfi_pending_last;
3872 stat.cfs_snd.cbs_inject_q_len =
3873 cfil_queue_len(&cfi->cfi_snd.cfi_inject_q);
3874 stat.cfs_snd.cbs_pass_offset =
3875 cfi->cfi_snd.cfi_pass_offset;
3876
3877 stat.cfs_rcv.cbs_pending_first =
3878 cfi->cfi_rcv.cfi_pending_first;
3879 stat.cfs_rcv.cbs_pending_last =
3880 cfi->cfi_rcv.cfi_pending_last;
3881 stat.cfs_rcv.cbs_inject_q_len =
3882 cfil_queue_len(&cfi->cfi_rcv.cfi_inject_q);
3883 stat.cfs_rcv.cbs_pass_offset =
3884 cfi->cfi_rcv.cfi_pass_offset;
3885
3886 for (i = 0; i < MAX_CONTENT_FILTER; i++) {
3887 struct cfil_entry_stat *estat;
3888 struct cfe_buf *ebuf;
3889 struct cfe_buf_stat *sbuf;
3890
3891 entry = &cfi->cfi_entries[i];
3892
3893 estat = &stat.ces_entries[i];
3894
3895 estat->ces_len = sizeof(struct cfil_entry_stat);
3896 estat->ces_filter_id = entry->cfe_filter ?
3897 entry->cfe_filter->cf_kcunit : 0;
3898 estat->ces_flags = entry->cfe_flags;
3899 estat->ces_necp_control_unit =
3900 entry->cfe_necp_control_unit;
3901
3902 estat->ces_last_event.tv_sec =
3903 (int64_t)entry->cfe_last_event.tv_sec;
3904 estat->ces_last_event.tv_usec =
3905 (int64_t)entry->cfe_last_event.tv_usec;
3906
3907 estat->ces_last_action.tv_sec =
3908 (int64_t)entry->cfe_last_action.tv_sec;
3909 estat->ces_last_action.tv_usec =
3910 (int64_t)entry->cfe_last_action.tv_usec;
3911
3912 ebuf = &entry->cfe_snd;
3913 sbuf = &estat->ces_snd;
3914 sbuf->cbs_pending_first =
3915 cfil_queue_offset_first(&ebuf->cfe_pending_q);
3916 sbuf->cbs_pending_last =
3917 cfil_queue_offset_last(&ebuf->cfe_pending_q);
3918 sbuf->cbs_ctl_first =
3919 cfil_queue_offset_first(&ebuf->cfe_ctl_q);
3920 sbuf->cbs_ctl_last =
3921 cfil_queue_offset_last(&ebuf->cfe_ctl_q);
3922 sbuf->cbs_pass_offset = ebuf->cfe_pass_offset;
3923 sbuf->cbs_peek_offset = ebuf->cfe_peek_offset;
3924 sbuf->cbs_peeked = ebuf->cfe_peeked;
3925
3926 ebuf = &entry->cfe_rcv;
3927 sbuf = &estat->ces_rcv;
3928 sbuf->cbs_pending_first =
3929 cfil_queue_offset_first(&ebuf->cfe_pending_q);
3930 sbuf->cbs_pending_last =
3931 cfil_queue_offset_last(&ebuf->cfe_pending_q);
3932 sbuf->cbs_ctl_first =
3933 cfil_queue_offset_first(&ebuf->cfe_ctl_q);
3934 sbuf->cbs_ctl_last =
3935 cfil_queue_offset_last(&ebuf->cfe_ctl_q);
3936 sbuf->cbs_pass_offset = ebuf->cfe_pass_offset;
3937 sbuf->cbs_peek_offset = ebuf->cfe_peek_offset;
3938 sbuf->cbs_peeked = ebuf->cfe_peeked;
3939 }
3940 error = SYSCTL_OUT(req, &stat,
3941 sizeof (struct cfil_sock_stat));
3942 if (error != 0)
3943 break;
3944 }
3945 done:
3946 cfil_rw_unlock_shared(&cfil_lck_rw);
3947
3948 return (error);
3949 }
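/*
 * The size-only probe above intentionally over-estimates by one eighth
 * (req->oldidx += req->oldidx >> 3) so that sockets attached between the
 * size probe and the subsequent read still fit in the caller's buffer.
 * Each record copied out is a fixed-size struct cfil_sock_stat, so user
 * space can walk the result by dividing the returned length by
 * sizeof(struct cfil_sock_stat).
 */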