bsd/net/content_filter.c

   1 /*
   2  * Copyright (c) 2013-2018 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. Please obtain a copy of the License at
  10  * http://www.opensource.apple.com/apsl/ and read it before using this
  11  * file.
  12  *
  13  * The Original Code and all software distributed under the License are
  14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  18  * Please see the License for the specific language governing rights and
  19  * limitations under the License.
  20  *
  21  * @APPLE_LICENSE_HEADER_END@
  22  */
  23
  24 /*
  25  * THEORY OF OPERATION
  26  *
  27  * The socket content filter subsystem provides a way for user space agents to
  28  * make filtering decisions based on the content of the data being sent and
  29  * received by TCP/IP sockets.
  30  *
  31  * A content filter user space agents gets a copy of the data and the data is
  32  * also kept in kernel buffer until the user space agents makes a pass or drop
  33  * decision. This unidirectional flow of content avoids unnecessary data copies
  34  * back to the kernel.
  35  *
  36  * A user space filter agent opens a kernel control socket with the name
  37  * CONTENT_FILTER_CONTROL_NAME to attach to the socket content filter subsystem.
  38  * When connected, a "struct content_filter" is created and set as the
  39  * "unitinfo" of the corresponding kernel control socket instance.
  40  *
  41  * The socket content filter subsystem exchanges messages with the user space
  42  * filter agent until an ultimate pass or drop decision is made by the
  43  * user space filter agent.
  44  *
  45  * It should be noted that messages about many TCP/IP sockets can be multiplexed
  46  * over a single kernel control socket.
  47  *
  48  * Notes:
  49  * - The current implementation is limited to TCP sockets.
  50  * - The current implementation supports up to two simultaneous content filters
  51  *   for the sake of simplicity of the implementation.
  52  *
  53  *
  54  * NECP FILTER CONTROL UNIT
  55  *
  56  * A user space filter agent uses the Network Extension Control Policy (NECP)
  57  * database to specify which TCP/IP sockets need to be filtered. The NECP
  58  * criteria may be based on a variety of properties like user ID or proc UUID.
  59  *
  60  * The NECP "filter control unit" is used by the socket content filter subsystem
  61  * to deliver the relevant TCP/IP content information to the appropriate
  62  * user space filter agent via its kernel control socket instance.
  63  * This works as follows:
  64  *
  65  * 1) The user space filter agent specifies an NECP filter control unit when
  66  *    in adds its filtering rules to the NECP database.
  67  *
  68  * 2) The user space filter agent also sets its NECP filter control unit on the
  69  *    content filter kernel control socket via the socket option
  70  *    CFIL_OPT_NECP_CONTROL_UNIT.
  71  *
  72  * 3) The NECP database is consulted to find out if a given TCP/IP socket
  73  *    needs to be subjected to content filtering and returns the corresponding
  74  *    NECP filter control unit  -- the NECP filter control unit is actually
  75  *    stored in the TCP/IP socket structure so the NECP lookup is really simple.
  76  *
  77  * 4) The NECP filter control unit is then used to find the corresponding
  78  *    kernel control socket instance.
  79  *
  80  * Note: NECP currently supports a single filter control unit per TCP/IP socket
  81  *       but this restriction may be soon lifted.
  82  *
  83  *
  84  * THE MESSAGING PROTOCOL
  85  *
  86  * The socket content filter subsystem and a user space filter agent
  87  * communicate over the kernel control socket via an asynchronous
  88  * messaging protocol (this is not a request-response protocol).
  89  * The socket content filter subsystem sends event messages to the user
  90  * space filter agent about the TCP/IP sockets it is interested to filter.
  91  * The user space filter agent sends action messages to either allow
  92  * data to pass or to disallow the data flow (and drop the connection).
  93  *
  94  * All messages over a content filter kernel control socket share the same
  95  * common header of type "struct cfil_msg_hdr". The message type tells if
  96  * it's a event message "CFM_TYPE_EVENT" or a action message "CFM_TYPE_ACTION".
  97  * The message header field "cfm_sock_id" identifies a given TCP/IP socket.
  98  * Note the message header length field may be padded for alignment and can
  99  * be larger than the actual content of the message.
 100  * The field "cfm_op" describe the kind of event or action.
 101  *
 102  * Here are the kinds of content filter events:
 103  * - CFM_OP_SOCKET_ATTACHED: a new TCP/IP socket is being filtered
 104  * - CFM_OP_SOCKET_CLOSED: A TCP/IP socket is closed
 105  * - CFM_OP_DATA_OUT: A span of data is being sent on a TCP/IP socket
 106  * - CFM_OP_DATA_IN: A span of data is being or received on a TCP/IP socket
 107  *
 108  *
 109  * EVENT MESSAGES
 110  *
 111  * The CFM_OP_DATA_OUT and CFM_OP_DATA_IN event messages contains a span of
 112  * data that is being sent or received. The position of this span of data
 113  * in the data flow is described by a set of start and end offsets. These
 114  * are absolute 64 bits offsets. The first byte sent (or received) starts
 115  * at offset 0 and ends at offset 1. The length of the content data
 116  * is given by the difference between the end offset and the start offset.
 117  *
 118  * After a CFM_OP_SOCKET_ATTACHED is delivered, CFM_OP_DATA_OUT and
 119  * CFM_OP_DATA_OUT events are not delivered until a CFM_OP_DATA_UPDATE
 120  * action message is sent by the user space filter agent.
 121  *
 122  * Note: absolute 64 bits offsets should be large enough for the foreseeable
 123  * future.  A 64-bits counter will wrap after 468 years at 10 Gbit/sec:
 124  *   2E64 / ((10E9 / 8) * 60 * 60 * 24 * 365.25) = 467.63
 125  *
 126  * They are two kinds of primary content filter actions:
 127  * - CFM_OP_DATA_UPDATE: to update pass or peek offsets for each direction.
 128  * - CFM_OP_DROP: to shutdown socket and disallow further data flow
 129  *
 130  * There is also an action to mark a given client flow as already filtered
 131  * at a higher level, CFM_OP_BLESS_CLIENT.
 132  *
 133  *
 134  * ACTION MESSAGES
 135  *
 136  * The CFM_OP_DATA_UPDATE action messages let the user space filter
 137  * agent allow data to flow up to the specified pass offset -- there
 138  * is a pass offset for outgoing data and  a pass offset for incoming data.
 139  * When a new TCP/IP socket is attached to the content filter, each pass offset
 140  * is initially set to 0 so not data is allowed to pass by default.
 141  * When the pass offset is set to CFM_MAX_OFFSET via a CFM_OP_DATA_UPDATE
 142  * then the data flow becomes unrestricted.
 143  *
 144  * Note that pass offsets can only be incremented. A CFM_OP_DATA_UPDATE message
 145  * with a pass offset smaller than the pass offset of a previous
 146  * CFM_OP_DATA_UPDATE message is silently ignored.
 147  *
 148  * A user space filter agent also uses CFM_OP_DATA_UPDATE action messages
 149  * to tell the kernel how much data it wants to see by using the peek offsets.
 150  * Just like pass offsets, there is a peek offset for each direction.
 151  * When a new TCP/IP socket is attached to the content filter, each peek offset
 152  * is initially set to 0 so no CFM_OP_DATA_OUT and CFM_OP_DATA_IN event
 153  * messages are dispatched by default until a CFM_OP_DATA_UPDATE action message
 154  * with a greater than 0 peek offset is sent by the user space filter agent.
 155  * When the peek offset is set to CFM_MAX_OFFSET via a CFM_OP_DATA_UPDATE
 156  * then the flow of update data events becomes unrestricted.
 157  *
 158  * Note that peek offsets cannot be smaller than the corresponding pass offset.
 159  * Also a peek offsets cannot be smaller than the corresponding end offset
 160  * of the last CFM_OP_DATA_OUT/CFM_OP_DATA_IN message dispatched. Trying
 161  * to set a too small peek value is silently ignored.
 162  *
 163  *
 164  * PER SOCKET "struct cfil_info"
 165  *
 166  * As soon as a TCP/IP socket gets attached to a content filter, a
 167  * "struct cfil_info" is created to hold the content filtering state for this
 168  * socket.
 169  *
 170  * The content filtering state is made of the following information
 171  * for each direction:
 172  * - The current pass offset;
 173  * - The first and last offsets of the data pending, waiting for a filtering
 174  *   decision;
 175  * - The inject queue for data that passed the filters and that needs
 176  *   to be re-injected;
 177  * - A content filter specific state in a set of  "struct cfil_entry"
 178  *
 179  *
 180  * CONTENT FILTER STATE "struct cfil_entry"
 181  *
 182  * The "struct cfil_entry" maintains the information most relevant to the
 183  * message handling over a kernel control socket with a user space filter agent.
 184  *
 185  * The "struct cfil_entry" holds the NECP filter control unit that corresponds
 186  * to the kernel control socket unit it corresponds to and also has a pointer
 187  * to the corresponding "struct content_filter".
 188  *
 189  * For each direction, "struct cfil_entry" maintains the following information:
 190  * - The pass offset
 191  * - The peek offset
 192  * - The offset of the last data peeked at by the filter
 193  * - A queue of data that's waiting to be delivered to the  user space filter
 194  *   agent on the kernel control socket
 195  * - A queue of data for which event messages have been sent on the kernel
 196  *   control socket and are pending for a filtering decision.
 197  *
 198  *
 199  * CONTENT FILTER QUEUES
 200  *
 201  * Data that is being filtered is steered away from the TCP/IP socket buffer
 202  * and instead will sit in one of three content filter queues until the data
 203  * can be re-injected into the TCP/IP socket buffer.
 204  *
 205  * A content filter queue is represented by "struct cfil_queue" that contains
 206  * a list of mbufs and the start and end offset of the data span of
 207  * the list of mbufs.
 208  *
 209  * The data moves into the three content filter queues according to this
 210  * sequence:
 211  * a) The "cfe_ctl_q" of "struct cfil_entry"
 212  * b) The "cfe_pending_q" of "struct cfil_entry"
 213  * c) The "cfi_inject_q" of "struct cfil_info"
 214  *
 215  * Note: The sequence (a),(b) may be repeated several times if there is more
 216  * than one content filter attached to the TCP/IP socket.
 217  *
 218  * The "cfe_ctl_q" queue holds data than cannot be delivered to the
 219  * kernel conntrol socket for two reasons:
 220  * - The peek offset is less that the end offset of the mbuf data
 221  * - The kernel control socket is flow controlled
 222  *
 223  * The "cfe_pending_q" queue holds data for which CFM_OP_DATA_OUT or
 224  * CFM_OP_DATA_IN have been successfully dispatched to the kernel control
 225  * socket and are waiting for a pass action message fromn the user space
 226  * filter agent. An mbuf length must be fully allowed to pass to be removed
 227  * from the cfe_pending_q.
 228  *
 229  * The "cfi_inject_q" queue holds data that has been fully allowed to pass
 230  * by the user space filter agent and that needs to be re-injected into the
 231  * TCP/IP socket.
 232  *
 233  *
 234  * IMPACT ON FLOW CONTROL
 235  *
 236  * An essential aspect of the content filer subsystem is to minimize the
 237  * impact on flow control of the TCP/IP sockets being filtered.
 238  *
 239  * The processing overhead of the content filtering may have an effect on
 240  * flow control by adding noticeable delays and cannot be eliminated --
 241  * care must be taken by the user space filter agent to minimize the
 242  * processing delays.
 243  *
 244  * The amount of data being filtered is kept in buffers while waiting for
 245  * a decision by the user space filter agent. This amount of data pending
 246  * needs to be subtracted from the amount of data available in the
 247  * corresponding TCP/IP socket buffer. This is done by modifying
 248  * sbspace() and tcp_sbspace() to account for amount of data pending
 249  * in the content filter.
 250  *
 251  *
 252  * LOCKING STRATEGY
 253  *
 254  * The global state of content filter subsystem is protected by a single
 255  * read-write lock "cfil_lck_rw". The data flow can be done with the
 256  * cfil read-write lock held as shared so it can be re-entered from multiple
 257  * threads.
 258  *
 259  * The per TCP/IP socket content filterstate -- "struct cfil_info" -- is
 260  * protected by the socket lock.
 261  *
 262  * A TCP/IP socket lock cannot be taken while the cfil read-write lock
 263  * is held. That's why we have some sequences where we drop the cfil read-write
 264  * lock before taking the TCP/IP lock.
 265  *
 266  * It is also important to lock the TCP/IP socket buffer while the content
 267  * filter is modifying the amount of pending data. Otherwise the calculations
 268  * in sbspace() and tcp_sbspace()  could be wrong.
 269  *
 270  * The "cfil_lck_rw" protects "struct content_filter" and also the fields
 271  * "cfe_link" and "cfe_filter" of "struct cfil_entry".
 272  *
 273  * Actually "cfe_link" and "cfe_filter" are protected by both by
 274  * "cfil_lck_rw" and the socket lock: they may be modified only when
 275  * "cfil_lck_rw" is exclusive and the socket is locked.
 276  *
 277  * To read the other fields of "struct content_filter" we have to take
 278  * "cfil_lck_rw" in shared mode.
 279  *
 280  *
 281  * LIMITATIONS
 282  *
 283  * - For TCP sockets only
 284  *
 285  * - Does not support TCP unordered messages
 286  */
 287
 288 /*
 289  *      TO DO LIST
 290  *
 291  *      SOONER:
 292  *
 293  *      Deal with OOB
 294  *
 295  *      LATER:
 296  *
 297  *      If support datagram, enqueue control and address mbufs as well
 298  */
 299
 300 #include <sys/types.h>
 301 #include <sys/kern_control.h>
 302 #include <sys/queue.h>
 303 #include <sys/domain.h>
 304 #include <sys/protosw.h>
 305 #include <sys/syslog.h>
 306 #include <sys/systm.h>
 307 #include <sys/param.h>
 308 #include <sys/mbuf.h>
 309
 310 #include <kern/locks.h>
 311 #include <kern/zalloc.h>
 312 #include <kern/debug.h>
 313
 314 #include <net/content_filter.h>
 315
 316 #include <netinet/in_pcb.h>
 317 #include <netinet/tcp.h>
 318 #include <netinet/tcp_var.h>
 319 #include <netinet/udp.h>
 320 #include <netinet/udp_var.h>
 321
 322 #include <string.h>
 323 #include <libkern/libkern.h>
 324 #include <kern/sched_prim.h>
 325
 326 #define MAX_CONTENT_FILTER 2
 327
 328 struct cfil_entry;
 329
 330 /*
 331  * The structure content_filter represents a user space content filter
 332  * It's created and associated with a kernel control socket instance
 333  */
 334 struct content_filter {
 335         kern_ctl_ref            cf_kcref;
 336         u_int32_t               cf_kcunit;
 337         u_int32_t               cf_flags;
 338
 339         uint32_t                cf_necp_control_unit;
 340
 341         uint32_t                cf_sock_count;
 342         TAILQ_HEAD(, cfil_entry) cf_sock_entries;
 343 };
 344
 345 #define CFF_ACTIVE              0x01
 346 #define CFF_DETACHING           0x02
 347 #define CFF_FLOW_CONTROLLED     0x04
 348
 349 struct content_filter **content_filters = NULL;
 350 uint32_t cfil_active_count = 0; /* Number of active content filters */
 351 uint32_t cfil_sock_attached_count = 0;  /* Number of sockets attachements */
 352 uint32_t cfil_sock_udp_attached_count = 0;      /* Number of UDP sockets attachements */
 353 uint32_t cfil_close_wait_timeout = 1000; /* in milliseconds */
 354
 355 static kern_ctl_ref cfil_kctlref = NULL;
 356
 357 static lck_grp_attr_t *cfil_lck_grp_attr = NULL;
 358 static lck_attr_t *cfil_lck_attr = NULL;
 359 static lck_grp_t *cfil_lck_grp = NULL;
 360 decl_lck_rw_data(static, cfil_lck_rw);
 361
 362 #define CFIL_RW_LCK_MAX 8
 363
 364 int cfil_rw_nxt_lck = 0;
 365 void* cfil_rw_lock_history[CFIL_RW_LCK_MAX];
 366
 367 int cfil_rw_nxt_unlck = 0;
 368 void* cfil_rw_unlock_history[CFIL_RW_LCK_MAX];
 369
 370 #define CONTENT_FILTER_ZONE_NAME        "content_filter"
 371 #define CONTENT_FILTER_ZONE_MAX         10
 372 static struct zone *content_filter_zone = NULL; /* zone for content_filter */
 373
 374
 375 #define CFIL_INFO_ZONE_NAME     "cfil_info"
 376 #define CFIL_INFO_ZONE_MAX      1024
 377 static struct zone *cfil_info_zone = NULL;      /* zone for cfil_info */
 378
 379 MBUFQ_HEAD(cfil_mqhead);
 380
 381 struct cfil_queue {
 382         uint64_t                q_start; /* offset of first byte in queue */
 383         uint64_t                q_end; /* offset of last byte in queue */
 384         struct cfil_mqhead      q_mq;
 385 };
 386
 387 /*
 388  * struct cfil_entry
 389  *
 390  * The is one entry per content filter
 391  */
 392 struct cfil_entry {
 393         TAILQ_ENTRY(cfil_entry) cfe_link;
 394         struct content_filter   *cfe_filter;
 395
 396         struct cfil_info        *cfe_cfil_info;
 397         uint32_t                cfe_flags;
 398         uint32_t                cfe_necp_control_unit;
 399         struct timeval          cfe_last_event; /* To user space */
 400         struct timeval          cfe_last_action; /* From user space */
 401
 402         struct cfe_buf {
 403                 /*
 404                  * cfe_pending_q holds data that has been delivered to
 405                  * the filter and for which we are waiting for an action
 406                  */
 407                 struct cfil_queue       cfe_pending_q;
 408                 /*
 409                  * This queue is for data that has not be delivered to
 410                  * the content filter (new data, pass peek or flow control)
 411                  */
 412                 struct cfil_queue       cfe_ctl_q;
 413
 414                 uint64_t                cfe_pass_offset;
 415                 uint64_t                cfe_peek_offset;
 416                 uint64_t                cfe_peeked;
 417         } cfe_snd, cfe_rcv;
 418 };
 419
 420 #define CFEF_CFIL_ATTACHED              0x0001  /* was attached to filter */
 421 #define CFEF_SENT_SOCK_ATTACHED         0x0002  /* sock attach event was sent */
 422 #define CFEF_DATA_START                 0x0004  /* can send data event */
 423 #define CFEF_FLOW_CONTROLLED            0x0008  /* wait for flow control lift */
 424 #define CFEF_SENT_DISCONNECT_IN         0x0010  /* event was sent */
 425 #define CFEF_SENT_DISCONNECT_OUT        0x0020  /* event was sent */
 426 #define CFEF_SENT_SOCK_CLOSED           0x0040  /* closed event was sent */
 427 #define CFEF_CFIL_DETACHED              0x0080  /* filter was detached */
 428
 429
 430 #define CFI_ADD_TIME_LOG(cfil, t1, t0, op)                                                                                      \
 431                 struct timeval _tdiff;                                                                                          \
 432                 if ((cfil)->cfi_op_list_ctr < CFI_MAX_TIME_LOG_ENTRY) {                                                         \
 433                         timersub(t1, t0, &_tdiff);                                                                              \
 434                         (cfil)->cfi_op_time[(cfil)->cfi_op_list_ctr] = (uint32_t)(_tdiff.tv_sec * 1000 + _tdiff.tv_usec / 1000);\
 435                         (cfil)->cfi_op_list[(cfil)->cfi_op_list_ctr] = (unsigned char)op;                                       \
 436                         (cfil)->cfi_op_list_ctr ++;                                                                             \
 437                 }
 438
 439 struct cfil_hash_entry;
 440
 441 /*
 442  * struct cfil_info
 443  *
 444  * There is a struct cfil_info per socket
 445  */
 446 struct cfil_info {
 447         TAILQ_ENTRY(cfil_info)  cfi_link;
 448         struct socket           *cfi_so;
 449         uint64_t                cfi_flags;
 450         uint64_t                cfi_sock_id;
 451         struct timeval64        cfi_first_event;
 452         uint32_t                cfi_op_list_ctr;
 453         uint32_t                cfi_op_time[CFI_MAX_TIME_LOG_ENTRY];    /* time interval in microseconds since first event */
 454         unsigned char           cfi_op_list[CFI_MAX_TIME_LOG_ENTRY];
 455
 456         struct cfi_buf {
 457                 /*
 458                  * cfi_pending_first and cfi_pending_last describe the total
 459                  * amount of data outstanding for all the filters on
 460                  * this socket and data in the flow queue
 461                  * cfi_pending_mbcnt counts in sballoc() "chars of mbufs used"
 462                  */
 463                 uint64_t                cfi_pending_first;
 464                 uint64_t                cfi_pending_last;
 465                 uint32_t                cfi_pending_mbcnt;
 466                 uint32_t                cfi_pending_mbnum;
 467                 uint32_t                cfi_tail_drop_cnt;
 468                 /*
 469                  * cfi_pass_offset is the minimum of all the filters
 470                  */
 471                 uint64_t                cfi_pass_offset;
 472                 /*
 473                  * cfi_inject_q holds data that needs to be re-injected
 474                  * into the socket after filtering and that can
 475                  * be queued because of flow control
 476                  */
 477                 struct cfil_queue       cfi_inject_q;
 478         } cfi_snd, cfi_rcv;
 479
 480         struct cfil_entry       cfi_entries[MAX_CONTENT_FILTER];
 481         struct cfil_hash_entry *cfi_hash_entry;
 482 } __attribute__((aligned(8)));
 483
 484 #define CFIF_DROP               0x0001  /* drop action applied */
 485 #define CFIF_CLOSE_WAIT         0x0002  /* waiting for filter to close */
 486 #define CFIF_SOCK_CLOSED        0x0004  /* socket is closed */
 487 #define CFIF_RETRY_INJECT_IN    0x0010  /* inject in failed */
 488 #define CFIF_RETRY_INJECT_OUT   0x0020  /* inject out failed */
 489 #define CFIF_SHUT_WR            0x0040  /* shutdown write */
 490 #define CFIF_SHUT_RD            0x0080  /* shutdown read */
 491
 492 #define CFI_MASK_GENCNT         0xFFFFFFFF00000000      /* upper 32 bits */
 493 #define CFI_SHIFT_GENCNT        32
 494 #define CFI_MASK_FLOWHASH       0x00000000FFFFFFFF      /* lower 32 bits */
 495 #define CFI_SHIFT_FLOWHASH      0
 496
 497 TAILQ_HEAD(cfil_sock_head, cfil_info) cfil_sock_head;
 498
 499 #define CFIL_QUEUE_VERIFY(x) if (cfil_debug) cfil_queue_verify(x)
 500 #define CFIL_INFO_VERIFY(x) if (cfil_debug) cfil_info_verify(x)
 501
 502 /*
 503  * UDP Socket Support
 504  */
 505 LIST_HEAD(cfilhashhead, cfil_hash_entry);
 506 #define CFILHASHSIZE 16
 507 #define CFIL_HASH(laddr, faddr, lport, fport) ((faddr) ^ ((laddr) >> 16) ^ (fport) ^ (lport))
 508 #define IS_UDP(so) (so && so->so_proto->pr_type == SOCK_DGRAM && so->so_proto->pr_protocol == IPPROTO_UDP)
 509 #define UNCONNECTED(inp) (inp && (((inp->inp_vflag & INP_IPV4) && (inp->inp_faddr.s_addr == INADDR_ANY)) || \
 510                                                                   ((inp->inp_vflag & INP_IPV6) && IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))))
 511 #define IS_ENTRY_ATTACHED(cfil_info, kcunit) (cfil_info != NULL && (kcunit <= MAX_CONTENT_FILTER) && \
 512                                                                                           cfil_info->cfi_entries[kcunit - 1].cfe_filter != NULL)
 513 #define IS_DNS(local, remote) (check_port(local, 53) || check_port(remote, 53) || check_port(local, 5353) || check_port(remote, 5353))
 514
 515 /*
 516  * UDP Garbage Collection:
 517  */
 518 static struct thread *cfil_udp_gc_thread;
 519 #define UDP_FLOW_GC_IDLE_TO          30  // Flow Idle Timeout in seconds
 520 #define UDP_FLOW_GC_ACTION_TO        10  // Flow Action Timeout (no action from user space) in seconds
 521 #define UDP_FLOW_GC_MAX_COUNT        100 // Max UDP flows to be handled per run
 522 #define UDP_FLOW_GC_RUN_INTERVAL_NSEC  (10 * NSEC_PER_SEC)  // GC wakes up every 10 seconds
 523
 524 /*
 525  * UDP flow queue thresholds
 526  */
 527 #define UDP_FLOW_GC_MBUF_CNT_MAX  (2 << MBSHIFT) // Max mbuf byte count in flow queue (2MB)
 528 #define UDP_FLOW_GC_MBUF_NUM_MAX  (UDP_FLOW_GC_MBUF_CNT_MAX >> MCLSHIFT) // Max mbuf count in flow queue (1K)
 529 #define UDP_FLOW_GC_MBUF_SHIFT    5             // Shift to get 1/32 of platform limits
 530 /*
 531  * UDP flow queue threshold globals:
 532  */
 533 static unsigned int cfil_udp_gc_mbuf_num_max = UDP_FLOW_GC_MBUF_NUM_MAX;
 534 static unsigned int cfil_udp_gc_mbuf_cnt_max = UDP_FLOW_GC_MBUF_CNT_MAX;
 535
 536 /*
 537  * struct cfil_hash_entry
 538  *
 539  * Hash entry for cfil_info
 540  */
 541 struct cfil_hash_entry {
 542         LIST_ENTRY(cfil_hash_entry)    cfentry_link;
 543         struct cfil_info               *cfentry_cfil;
 544         u_short cfentry_fport;
 545         u_short cfentry_lport;
 546         sa_family_t                    cfentry_family;
 547         u_int32_t                      cfentry_flowhash;
 548         u_int32_t                      cfentry_lastused;
 549         union {
 550                 /* foreign host table entry */
 551                 struct in_addr_4in6 addr46;
 552                 struct in6_addr addr6;
 553         } cfentry_faddr;
 554         union {
 555                 /* local host table entry */
 556                 struct in_addr_4in6 addr46;
 557                 struct in6_addr addr6;
 558         } cfentry_laddr;
 559 };
 560
 561 /*
 562  * struct cfil_db
 563  *
 564  * For each UDP socket, this is a hash table maintaining all cfil_info structs
 565  * keyed by the flow 4-tuples <lport,fport,laddr,faddr>.
 566  */
 567 struct cfil_db {
 568         struct socket       *cfdb_so;
 569         uint32_t            cfdb_count;       /* Number of total content filters */
 570         struct cfilhashhead *cfdb_hashbase;
 571         u_long              cfdb_hashmask;
 572         struct cfil_hash_entry *cfdb_only_entry;  /* Optimization for connected UDP */
 573 };
 574
 575 /*
 576  * CFIL specific mbuf tag:
 577  * Save state of socket at the point of data entry into cfil.
 578  * Use saved state for reinjection at protocol layer.
 579  */
 580 struct cfil_tag {
 581         union sockaddr_in_4_6 cfil_faddr;
 582         uint32_t cfil_so_state_change_cnt;
 583         short cfil_so_options;
 584 };
 585
 586 #define    CFIL_HASH_ENTRY_ZONE_NAME    "cfil_entry_hash"
 587 #define    CFIL_HASH_ENTRY_ZONE_MAX     1024
 588 static struct zone *cfil_hash_entry_zone = NULL;
 589
 590 #define    CFIL_DB_ZONE_NAME       "cfil_db"
 591 #define    CFIL_DB_ZONE_MAX        1024
 592 static struct zone *cfil_db_zone = NULL;
 593
 594 /*
 595  * Statistics
 596  */
 597
 598 struct cfil_stats cfil_stats;
 599
 600 /*
 601  * For troubleshooting
 602  */
 603 int cfil_log_level = LOG_ERR;
 604 int cfil_debug = 1;
 605
 606 // Debug controls added for selective debugging.
 607 // Disabled for production.  If enabled,
 608 // these will have performance impact
 609 #define LIFECYCLE_DEBUG 0
 610 #define VERDICT_DEBUG 0
 611 #define DATA_DEBUG 0
 612 #define SHOW_DEBUG 0
 613 #define GC_DEBUG 0
 614
 615 /*
 616  * Sysctls for logs and statistics
 617  */
 618 static int sysctl_cfil_filter_list(struct sysctl_oid *, void *, int,
 619     struct sysctl_req *);
 620 static int sysctl_cfil_sock_list(struct sysctl_oid *, void *, int,
 621     struct sysctl_req *);
 622
 623 SYSCTL_NODE(_net, OID_AUTO, cfil, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "cfil");
 624
 625 SYSCTL_INT(_net_cfil, OID_AUTO, log, CTLFLAG_RW | CTLFLAG_LOCKED,
 626     &cfil_log_level, 0, "");
 627
 628 SYSCTL_INT(_net_cfil, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED,
 629     &cfil_debug, 0, "");
 630
 631 SYSCTL_UINT(_net_cfil, OID_AUTO, sock_attached_count, CTLFLAG_RD | CTLFLAG_LOCKED,
 632     &cfil_sock_attached_count, 0, "");
 633
 634 SYSCTL_UINT(_net_cfil, OID_AUTO, active_count, CTLFLAG_RD | CTLFLAG_LOCKED,
 635     &cfil_active_count, 0, "");
 636
 637 SYSCTL_UINT(_net_cfil, OID_AUTO, close_wait_timeout, CTLFLAG_RW | CTLFLAG_LOCKED,
 638     &cfil_close_wait_timeout, 0, "");
 639
 640 static int cfil_sbtrim = 1;
 641 SYSCTL_UINT(_net_cfil, OID_AUTO, sbtrim, CTLFLAG_RW | CTLFLAG_LOCKED,
 642     &cfil_sbtrim, 0, "");
 643
 644 SYSCTL_PROC(_net_cfil, OID_AUTO, filter_list, CTLFLAG_RD | CTLFLAG_LOCKED,
 645     0, 0, sysctl_cfil_filter_list, "S,cfil_filter_stat", "");
 646
 647 SYSCTL_PROC(_net_cfil, OID_AUTO, sock_list, CTLFLAG_RD | CTLFLAG_LOCKED,
 648     0, 0, sysctl_cfil_sock_list, "S,cfil_sock_stat", "");
 649
 650 SYSCTL_STRUCT(_net_cfil, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_LOCKED,
 651     &cfil_stats, cfil_stats, "");
 652
 653 /*
 654  * Forward declaration to appease the compiler
 655  */
 656 static int cfil_action_data_pass(struct socket *, struct cfil_info *, uint32_t, int,
 657     uint64_t, uint64_t);
 658 static int cfil_action_drop(struct socket *, struct cfil_info *, uint32_t);
 659 static int cfil_action_bless_client(uint32_t, struct cfil_msg_hdr *);
 660 static int cfil_dispatch_closed_event(struct socket *, struct cfil_info *, int);
 661 static int cfil_data_common(struct socket *, struct cfil_info *, int, struct sockaddr *,
 662     struct mbuf *, struct mbuf *, uint32_t);
 663 static int cfil_data_filter(struct socket *, struct cfil_info *, uint32_t, int,
 664     struct mbuf *, uint64_t);
 665 static void fill_ip_sockaddr_4_6(union sockaddr_in_4_6 *,
 666     struct in_addr, u_int16_t);
 667 static void fill_ip6_sockaddr_4_6(union sockaddr_in_4_6 *,
 668     struct in6_addr *, u_int16_t);
 669 ;
 670 static int cfil_dispatch_attach_event(struct socket *, struct cfil_info *, uint32_t);
 671 static void cfil_info_free(struct cfil_info *);
 672 static struct cfil_info * cfil_info_alloc(struct socket *, struct cfil_hash_entry *);
 673 static int cfil_info_attach_unit(struct socket *, uint32_t, struct cfil_info *);
 674 static struct socket * cfil_socket_from_sock_id(cfil_sock_id_t, bool);
 675 static struct socket * cfil_socket_from_client_uuid(uuid_t, bool *);
 676 static int cfil_service_pending_queue(struct socket *, struct cfil_info *, uint32_t, int);
 677 static int cfil_data_service_ctl_q(struct socket *, struct cfil_info *, uint32_t, int);
 678 static void cfil_info_verify(struct cfil_info *);
 679 static int cfil_update_data_offsets(struct socket *, struct cfil_info *, uint32_t, int,
 680     uint64_t, uint64_t);
 681 static int cfil_acquire_sockbuf(struct socket *, struct cfil_info *, int);
 682 static void cfil_release_sockbuf(struct socket *, int);
 683 static int cfil_filters_attached(struct socket *);
 684
 685 static void cfil_rw_lock_exclusive(lck_rw_t *);
 686 static void cfil_rw_unlock_exclusive(lck_rw_t *);
 687 static void cfil_rw_lock_shared(lck_rw_t *);
 688 static void cfil_rw_unlock_shared(lck_rw_t *);
 689 static boolean_t cfil_rw_lock_shared_to_exclusive(lck_rw_t *);
 690 static void cfil_rw_lock_exclusive_to_shared(lck_rw_t *);
 691
 692 static unsigned int cfil_data_length(struct mbuf *, int *, int *);
 693 static errno_t cfil_db_init(struct socket *);
 694 static void cfil_db_free(struct socket *so);
 695 struct cfil_hash_entry *cfil_db_lookup_entry(struct cfil_db *, struct sockaddr *, struct sockaddr *);
 696 struct cfil_hash_entry *cfil_db_lookup_entry_with_sockid(struct cfil_db *, u_int64_t);
 697 struct cfil_hash_entry *cfil_db_add_entry(struct cfil_db *, struct sockaddr *, struct sockaddr *);
 698 void cfil_db_delete_entry(struct cfil_db *, struct cfil_hash_entry *);
 699 struct cfil_hash_entry *cfil_sock_udp_get_flow(struct socket *, uint32_t, bool, struct sockaddr *, struct sockaddr *);
 700 struct cfil_info *cfil_db_get_cfil_info(struct cfil_db *, cfil_sock_id_t);
 701 static errno_t cfil_sock_udp_handle_data(bool, struct socket *, struct sockaddr *, struct sockaddr *,
 702     struct mbuf *, struct mbuf *, uint32_t);
 703 static int32_t cfil_sock_udp_data_pending(struct sockbuf *, bool);
 704 static void cfil_sock_udp_is_closed(struct socket *);
 705 static int cfil_sock_udp_notify_shutdown(struct socket *, int, int, int);
 706 static int cfil_sock_udp_shutdown(struct socket *, int *);
 707 static void cfil_sock_udp_close_wait(struct socket *);
 708 static void cfil_sock_udp_buf_update(struct sockbuf *);
 709 static int cfil_filters_udp_attached(struct socket *, bool);
 710 static void cfil_get_flow_address_v6(struct cfil_hash_entry *, struct inpcb *,
 711     struct in6_addr **, struct in6_addr **,
 712     u_int16_t *, u_int16_t *);
 713 static void cfil_get_flow_address(struct cfil_hash_entry *, struct inpcb *,
 714     struct in_addr *, struct in_addr *,
 715     u_int16_t *, u_int16_t *);
 716 static void cfil_info_log(int, struct cfil_info *, const char *);
 717 void cfil_filter_show(u_int32_t);
 718 void cfil_info_show(void);
 719 bool cfil_info_idle_timed_out(struct cfil_info *, int, u_int32_t);
 720 bool cfil_info_action_timed_out(struct cfil_info *, int);
 721 bool cfil_info_buffer_threshold_exceeded(struct cfil_info *);
 722 struct m_tag *cfil_udp_save_socket_state(struct cfil_info *, struct mbuf *);
 723 static void cfil_udp_gc_thread_func(void *, wait_result_t);
 724 static void cfil_info_udp_expire(void *, wait_result_t);
 725
 726 bool check_port(struct sockaddr *, u_short);
 727
 728 /*
 729  * Content filter global read write lock
 730  */
 731
 732 static void
 733 cfil_rw_lock_exclusive(lck_rw_t *lck)
 734 {
 735         void *lr_saved;
 736
 737         lr_saved = __builtin_return_address(0);
 738
 739         lck_rw_lock_exclusive(lck);
 740
 741         cfil_rw_lock_history[cfil_rw_nxt_lck] = lr_saved;
 742         cfil_rw_nxt_lck = (cfil_rw_nxt_lck + 1) % CFIL_RW_LCK_MAX;
 743 }
 744
 745 static void
 746 cfil_rw_unlock_exclusive(lck_rw_t *lck)
 747 {
 748         void *lr_saved;
 749
 750         lr_saved = __builtin_return_address(0);
 751
 752         lck_rw_unlock_exclusive(lck);
 753
 754         cfil_rw_unlock_history[cfil_rw_nxt_unlck] = lr_saved;
 755         cfil_rw_nxt_unlck = (cfil_rw_nxt_unlck + 1) % CFIL_RW_LCK_MAX;
 756 }
 757
 758 static void
 759 cfil_rw_lock_shared(lck_rw_t *lck)
 760 {
 761         void *lr_saved;
 762
 763         lr_saved = __builtin_return_address(0);
 764
 765         lck_rw_lock_shared(lck);
 766
 767         cfil_rw_lock_history[cfil_rw_nxt_lck] = lr_saved;
 768         cfil_rw_nxt_lck = (cfil_rw_nxt_lck + 1) % CFIL_RW_LCK_MAX;
 769 }
 770
 771 static void
 772 cfil_rw_unlock_shared(lck_rw_t *lck)
 773 {
 774         void *lr_saved;
 775
 776         lr_saved = __builtin_return_address(0);
 777
 778         lck_rw_unlock_shared(lck);
 779
 780         cfil_rw_unlock_history[cfil_rw_nxt_unlck] = lr_saved;
 781         cfil_rw_nxt_unlck = (cfil_rw_nxt_unlck + 1) % CFIL_RW_LCK_MAX;
 782 }
 783
 784 static boolean_t
 785 cfil_rw_lock_shared_to_exclusive(lck_rw_t *lck)
 786 {
 787         void *lr_saved;
 788         boolean_t upgraded;
 789
 790         lr_saved = __builtin_return_address(0);
 791
 792         upgraded = lck_rw_lock_shared_to_exclusive(lck);
 793         if (upgraded) {
 794                 cfil_rw_unlock_history[cfil_rw_nxt_unlck] = lr_saved;
 795                 cfil_rw_nxt_unlck = (cfil_rw_nxt_unlck + 1) % CFIL_RW_LCK_MAX;
 796         }
 797         return upgraded;
 798 }
 799
 800 static void
 801 cfil_rw_lock_exclusive_to_shared(lck_rw_t *lck)
 802 {
 803         void *lr_saved;
 804
 805         lr_saved = __builtin_return_address(0);
 806
 807         lck_rw_lock_exclusive_to_shared(lck);
 808
 809         cfil_rw_lock_history[cfil_rw_nxt_lck] = lr_saved;
 810         cfil_rw_nxt_lck = (cfil_rw_nxt_lck + 1) % CFIL_RW_LCK_MAX;
 811 }
 812
 813 static void
 814 cfil_rw_lock_assert_held(lck_rw_t *lck, int exclusive)
 815 {
 816 #if !MACH_ASSERT
 817 #pragma unused(lck, exclusive)
 818 #endif
 819         LCK_RW_ASSERT(lck,
 820             exclusive ? LCK_RW_ASSERT_EXCLUSIVE : LCK_RW_ASSERT_HELD);
 821 }
 822
 823 /*
 824  * Return the number of bytes in the mbuf chain using the same
 825  * method as m_length() or sballoc()
 826  *
 827  * Returns data len - starting from PKT start
 828  * - retmbcnt - optional param to get total mbuf bytes in chain
 829  * - retmbnum - optional param to get number of mbufs in chain
 830  */
 831 static unsigned int
 832 cfil_data_length(struct mbuf *m, int *retmbcnt, int *retmbnum)
 833 {
 834         struct mbuf *m0;
 835         unsigned int pktlen = 0;
 836         int mbcnt;
 837         int mbnum;
 838
 839         // Locate the start of data
 840         for (m0 = m; m0 != NULL; m0 = m0->m_next) {
 841                 if (m0->m_flags & M_PKTHDR) {
 842                         break;
 843                 }
 844         }
 845         if (m0 == NULL) {
 846                 CFIL_LOG(LOG_ERR, "cfil_data_length: no M_PKTHDR");
 847                 return 0;
 848         }
 849         m = m0;
 850
 851         if (retmbcnt == NULL && retmbnum == NULL) {
 852                 return m_length(m);
 853         }
 854
 855         pktlen = 0;
 856         mbcnt = 0;
 857         mbnum = 0;
 858         for (m0 = m; m0 != NULL; m0 = m0->m_next) {
 859                 pktlen += m0->m_len;
 860                 mbnum++;
 861                 mbcnt += MSIZE;
 862                 if (m0->m_flags & M_EXT) {
 863                         mbcnt += m0->m_ext.ext_size;
 864                 }
 865         }
 866         if (retmbcnt) {
 867                 *retmbcnt = mbcnt;
 868         }
 869         if (retmbnum) {
 870                 *retmbnum = mbnum;
 871         }
 872         return pktlen;
 873 }
 874
 875 static struct mbuf *
 876 cfil_data_start(struct mbuf *m)
 877 {
 878         struct mbuf *m0;
 879
 880         // Locate the start of data
 881         for (m0 = m; m0 != NULL; m0 = m0->m_next) {
 882                 if (m0->m_flags & M_PKTHDR) {
 883                         break;
 884                 }
 885         }
 886         return m0;
 887 }
 888
 889 /*
 890  * Common mbuf queue utilities
 891  */
 892
 893 static inline void
 894 cfil_queue_init(struct cfil_queue *cfq)
 895 {
 896         cfq->q_start = 0;
 897         cfq->q_end = 0;
 898         MBUFQ_INIT(&cfq->q_mq);
 899 }
 900
 901 static inline uint64_t
 902 cfil_queue_drain(struct cfil_queue *cfq)
 903 {
 904         uint64_t drained = cfq->q_start - cfq->q_end;
 905         cfq->q_start = 0;
 906         cfq->q_end = 0;
 907         MBUFQ_DRAIN(&cfq->q_mq);
 908
 909         return drained;
 910 }
 911
 912 /* Return 1 when empty, 0 otherwise */
 913 static inline int
 914 cfil_queue_empty(struct cfil_queue *cfq)
 915 {
 916         return MBUFQ_EMPTY(&cfq->q_mq);
 917 }
 918
 919 static inline uint64_t
 920 cfil_queue_offset_first(struct cfil_queue *cfq)
 921 {
 922         return cfq->q_start;
 923 }
 924
 925 static inline uint64_t
 926 cfil_queue_offset_last(struct cfil_queue *cfq)
 927 {
 928         return cfq->q_end;
 929 }
 930
 931 static inline uint64_t
 932 cfil_queue_len(struct cfil_queue *cfq)
 933 {
 934         return cfq->q_end - cfq->q_start;
 935 }
 936
 937 /*
 938  * Routines to verify some fundamental assumptions
 939  */
 940
 941 static void
 942 cfil_queue_verify(struct cfil_queue *cfq)
 943 {
 944         mbuf_t chain;
 945         mbuf_t m;
 946         mbuf_t n;
 947         uint64_t queuesize = 0;
 948
 949         /* Verify offset are ordered */
 950         VERIFY(cfq->q_start <= cfq->q_end);
 951
 952         /*
 953          * When queue is empty, the offsets are equal otherwise the offsets
 954          * are different
 955          */
 956         VERIFY((MBUFQ_EMPTY(&cfq->q_mq) && cfq->q_start == cfq->q_end) ||
 957             (!MBUFQ_EMPTY(&cfq->q_mq) &&
 958             cfq->q_start != cfq->q_end));
 959
 960         MBUFQ_FOREACH(chain, &cfq->q_mq) {
 961                 size_t chainsize = 0;
 962                 m = chain;
 963                 unsigned int mlen = cfil_data_length(m, NULL, NULL);
 964                 // skip the addr and control stuff if present
 965                 m = cfil_data_start(m);
 966
 967                 if (m == NULL ||
 968                     m == (void *)M_TAG_FREE_PATTERN ||
 969                     m->m_next == (void *)M_TAG_FREE_PATTERN ||
 970                     m->m_nextpkt == (void *)M_TAG_FREE_PATTERN) {
 971                         panic("%s - mq %p is free at %p", __func__,
 972                             &cfq->q_mq, m);
 973                 }
 974                 for (n = m; n != NULL; n = n->m_next) {
 975                         if (n->m_type != MT_DATA &&
 976                             n->m_type != MT_HEADER &&
 977                             n->m_type != MT_OOBDATA) {
 978                                 panic("%s - %p unsupported type %u", __func__,
 979                                     n, n->m_type);
 980                         }
 981                         chainsize += n->m_len;
 982                 }
 983                 if (mlen != chainsize) {
 984                         panic("%s - %p m_length() %u != chainsize %lu",
 985                             __func__, m, mlen, chainsize);
 986                 }
 987                 queuesize += chainsize;
 988         }
 989         if (queuesize != cfq->q_end - cfq->q_start) {
 990                 panic("%s - %p queuesize %llu != offsetdiffs %llu", __func__,
 991                     m, queuesize, cfq->q_end - cfq->q_start);
 992         }
 993 }
 994
 995 static void
 996 cfil_queue_enqueue(struct cfil_queue *cfq, mbuf_t m, size_t len)
 997 {
 998         CFIL_QUEUE_VERIFY(cfq);
 999
1000         MBUFQ_ENQUEUE(&cfq->q_mq, m);
1001         cfq->q_end += len;
1002
1003         CFIL_QUEUE_VERIFY(cfq);
1004 }
1005
1006 static void
1007 cfil_queue_remove(struct cfil_queue *cfq, mbuf_t m, size_t len)
1008 {
1009         CFIL_QUEUE_VERIFY(cfq);
1010
1011         VERIFY(cfil_data_length(m, NULL, NULL) == len);
1012
1013         MBUFQ_REMOVE(&cfq->q_mq, m);
1014         MBUFQ_NEXT(m) = NULL;
1015         cfq->q_start += len;
1016
1017         CFIL_QUEUE_VERIFY(cfq);
1018 }
1019
1020 static mbuf_t
1021 cfil_queue_first(struct cfil_queue *cfq)
1022 {
1023         return MBUFQ_FIRST(&cfq->q_mq);
1024 }
1025
1026 static mbuf_t
1027 cfil_queue_next(struct cfil_queue *cfq, mbuf_t m)
1028 {
1029 #pragma unused(cfq)
1030         return MBUFQ_NEXT(m);
1031 }
1032
1033 static void
1034 cfil_entry_buf_verify(struct cfe_buf *cfe_buf)
1035 {
1036         CFIL_QUEUE_VERIFY(&cfe_buf->cfe_ctl_q);
1037         CFIL_QUEUE_VERIFY(&cfe_buf->cfe_pending_q);
1038
1039         /* Verify the queues are ordered so that pending is before ctl */
1040         VERIFY(cfe_buf->cfe_ctl_q.q_start >= cfe_buf->cfe_pending_q.q_end);
1041
1042         /* The peek offset cannot be less than the pass offset */
1043         VERIFY(cfe_buf->cfe_peek_offset >= cfe_buf->cfe_pass_offset);
1044
1045         /* Make sure we've updated the offset we peeked at  */
1046         VERIFY(cfe_buf->cfe_ctl_q.q_start <= cfe_buf->cfe_peeked);
1047 }
1048
1049 static void
1050 cfil_entry_verify(struct cfil_entry *entry)
1051 {
1052         cfil_entry_buf_verify(&entry->cfe_snd);
1053         cfil_entry_buf_verify(&entry->cfe_rcv);
1054 }
1055
1056 static void
1057 cfil_info_buf_verify(struct cfi_buf *cfi_buf)
1058 {
1059         CFIL_QUEUE_VERIFY(&cfi_buf->cfi_inject_q);
1060
1061         VERIFY(cfi_buf->cfi_pending_first <= cfi_buf->cfi_pending_last);
1062         VERIFY(cfi_buf->cfi_pending_mbcnt >= 0);
1063 }
1064
1065 static void
1066 cfil_info_verify(struct cfil_info *cfil_info)
1067 {
1068         int i;
1069
1070         if (cfil_info == NULL) {
1071                 return;
1072         }
1073
1074         cfil_info_buf_verify(&cfil_info->cfi_snd);
1075         cfil_info_buf_verify(&cfil_info->cfi_rcv);
1076
1077         for (i = 0; i < MAX_CONTENT_FILTER; i++) {
1078                 cfil_entry_verify(&cfil_info->cfi_entries[i]);
1079         }
1080 }
1081
1082 static void
1083 verify_content_filter(struct content_filter *cfc)
1084 {
1085         struct cfil_entry *entry;
1086         uint32_t count = 0;
1087
1088         VERIFY(cfc->cf_sock_count >= 0);
1089
1090         TAILQ_FOREACH(entry, &cfc->cf_sock_entries, cfe_link) {
1091                 count++;
1092                 VERIFY(cfc == entry->cfe_filter);
1093         }
1094         VERIFY(count == cfc->cf_sock_count);
1095 }
1096
1097 /*
1098  * Kernel control socket callbacks
1099  */
1100 static errno_t
1101 cfil_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
1102     void **unitinfo)
1103 {
1104         errno_t error = 0;
1105         struct content_filter *cfc = NULL;
1106
1107         CFIL_LOG(LOG_NOTICE, "");
1108
1109         cfc = zalloc(content_filter_zone);
1110         if (cfc == NULL) {
1111                 CFIL_LOG(LOG_ERR, "zalloc failed");
1112                 error = ENOMEM;
1113                 goto done;
1114         }
1115         bzero(cfc, sizeof(struct content_filter));
1116
1117         cfil_rw_lock_exclusive(&cfil_lck_rw);
1118         if (content_filters == NULL) {
1119                 struct content_filter **tmp;
1120
1121                 cfil_rw_unlock_exclusive(&cfil_lck_rw);
1122
1123                 MALLOC(tmp,
1124                     struct content_filter **,
1125                     MAX_CONTENT_FILTER * sizeof(struct content_filter *),
1126                     M_TEMP,
1127                     M_WAITOK | M_ZERO);
1128
1129                 cfil_rw_lock_exclusive(&cfil_lck_rw);
1130
1131                 if (tmp == NULL && content_filters == NULL) {
1132                         error = ENOMEM;
1133                         cfil_rw_unlock_exclusive(&cfil_lck_rw);
1134                         goto done;
1135                 }
1136                 /* Another thread may have won the race */
1137                 if (content_filters != NULL) {
1138                         FREE(tmp, M_TEMP);
1139                 } else {
1140                         content_filters = tmp;
1141                 }
1142         }
1143
1144         if (sac->sc_unit == 0 || sac->sc_unit > MAX_CONTENT_FILTER) {
1145                 CFIL_LOG(LOG_ERR, "bad sc_unit %u", sac->sc_unit);
1146                 error = EINVAL;
1147         } else if (content_filters[sac->sc_unit - 1] != NULL) {
1148                 CFIL_LOG(LOG_ERR, "sc_unit %u in use", sac->sc_unit);
1149                 error = EADDRINUSE;
1150         } else {
1151                 /*
1152                  * kernel control socket kcunit numbers start at 1
1153                  */
1154                 content_filters[sac->sc_unit - 1] = cfc;
1155
1156                 cfc->cf_kcref = kctlref;
1157                 cfc->cf_kcunit = sac->sc_unit;
1158                 TAILQ_INIT(&cfc->cf_sock_entries);
1159
1160                 *unitinfo = cfc;
1161                 cfil_active_count++;
1162         }
1163         cfil_rw_unlock_exclusive(&cfil_lck_rw);
1164 done:
1165         if (error != 0 && cfc != NULL) {
1166                 zfree(content_filter_zone, cfc);
1167         }
1168
1169         if (error == 0) {
1170                 OSIncrementAtomic(&cfil_stats.cfs_ctl_connect_ok);
1171         } else {
1172                 OSIncrementAtomic(&cfil_stats.cfs_ctl_connect_fail);
1173         }
1174
1175         CFIL_LOG(LOG_INFO, "return %d cfil_active_count %u kcunit %u",
1176             error, cfil_active_count, sac->sc_unit);
1177
1178         return error;
1179 }
1180
1181 static errno_t
1182 cfil_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo)
1183 {
1184 #pragma unused(kctlref)
1185         errno_t error = 0;
1186         struct content_filter *cfc;
1187         struct cfil_entry *entry;
1188         uint64_t sock_flow_id = 0;
1189
1190         CFIL_LOG(LOG_NOTICE, "");
1191
1192         if (content_filters == NULL) {
1193                 CFIL_LOG(LOG_ERR, "no content filter");
1194                 error = EINVAL;
1195                 goto done;
1196         }
1197         if (kcunit > MAX_CONTENT_FILTER) {
1198                 CFIL_LOG(LOG_ERR, "kcunit %u > MAX_CONTENT_FILTER (%d)",
1199                     kcunit, MAX_CONTENT_FILTER);
1200                 error = EINVAL;
1201                 goto done;
1202         }
1203
1204         cfc = (struct content_filter *)unitinfo;
1205         if (cfc == NULL) {
1206                 goto done;
1207         }
1208
1209         cfil_rw_lock_exclusive(&cfil_lck_rw);
1210         if (content_filters[kcunit - 1] != cfc || cfc->cf_kcunit != kcunit) {
1211                 CFIL_LOG(LOG_ERR, "bad unit info %u)",
1212                     kcunit);
1213                 cfil_rw_unlock_exclusive(&cfil_lck_rw);
1214                 goto done;
1215         }
1216         cfc->cf_flags |= CFF_DETACHING;
1217         /*
1218          * Remove all sockets from the filter
1219          */
1220         while ((entry = TAILQ_FIRST(&cfc->cf_sock_entries)) != NULL) {
1221                 cfil_rw_lock_assert_held(&cfil_lck_rw, 1);
1222
1223                 verify_content_filter(cfc);
1224                 /*
1225                  * Accept all outstanding data by pushing to next filter
1226                  * or back to socket
1227                  *
1228                  * TBD: Actually we should make sure all data has been pushed
1229                  * back to socket
1230                  */
1231                 if (entry->cfe_cfil_info && entry->cfe_cfil_info->cfi_so) {
1232                         struct cfil_info *cfil_info = entry->cfe_cfil_info;
1233                         struct socket *so = cfil_info->cfi_so;
1234                         sock_flow_id = cfil_info->cfi_sock_id;
1235
1236                         /* Need to let data flow immediately */
1237                         entry->cfe_flags |= CFEF_SENT_SOCK_ATTACHED |
1238                             CFEF_DATA_START;
1239
1240                         /*
1241                          * Respect locking hierarchy
1242                          */
1243                         cfil_rw_unlock_exclusive(&cfil_lck_rw);
1244
1245                         socket_lock(so, 1);
1246
1247                         /*
1248                          * When cfe_filter is NULL the filter is detached
1249                          * and the entry has been removed from cf_sock_entries
1250                          */
1251                         if ((so->so_cfil == NULL && so->so_cfil_db == NULL) || entry->cfe_filter == NULL) {
1252                                 cfil_rw_lock_exclusive(&cfil_lck_rw);
1253                                 goto release;
1254                         }
1255
1256                         (void) cfil_action_data_pass(so, cfil_info, kcunit, 1,
1257                             CFM_MAX_OFFSET,
1258                             CFM_MAX_OFFSET);
1259
1260                         (void) cfil_action_data_pass(so, cfil_info, kcunit, 0,
1261                             CFM_MAX_OFFSET,
1262                             CFM_MAX_OFFSET);
1263
1264                         cfil_rw_lock_exclusive(&cfil_lck_rw);
1265
1266                         /*
1267                          * Check again to make sure if the cfil_info is still valid
1268                          * as the socket may have been unlocked when when calling
1269                          * cfil_acquire_sockbuf()
1270                          */
1271                         if (entry->cfe_filter == NULL ||
1272                             (so->so_cfil == NULL && cfil_db_get_cfil_info(so->so_cfil_db, sock_flow_id) == NULL)) {
1273                                 goto release;
1274                         }
1275
1276                         /* The filter is now detached */
1277                         entry->cfe_flags |= CFEF_CFIL_DETACHED;
1278 #if LIFECYCLE_DEBUG
1279                         cfil_info_log(LOG_DEBUG, cfil_info, "CFIL: LIFECYCLE: - FILTER DISCONNECTED");
1280 #endif
1281                         CFIL_LOG(LOG_NOTICE, "so %llx detached %u",
1282                             (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit);
1283                         if ((cfil_info->cfi_flags & CFIF_CLOSE_WAIT) &&
1284                             cfil_filters_attached(so) == 0) {
1285                                 CFIL_LOG(LOG_NOTICE, "so %llx waking",
1286                                     (uint64_t)VM_KERNEL_ADDRPERM(so));
1287                                 wakeup((caddr_t)cfil_info);
1288                         }
1289
1290                         /*
1291                          * Remove the filter entry from the content filter
1292                          * but leave the rest of the state intact as the queues
1293                          * may not be empty yet
1294                          */
1295                         entry->cfe_filter = NULL;
1296                         entry->cfe_necp_control_unit = 0;
1297
1298                         TAILQ_REMOVE(&cfc->cf_sock_entries, entry, cfe_link);
1299                         cfc->cf_sock_count--;
1300 release:
1301                         socket_unlock(so, 1);
1302                 }
1303         }
1304         verify_content_filter(cfc);
1305
1306         VERIFY(cfc->cf_sock_count == 0);
1307
1308         /*
1309          * Make filter inactive
1310          */
1311         content_filters[kcunit - 1] = NULL;
1312         cfil_active_count--;
1313         cfil_rw_unlock_exclusive(&cfil_lck_rw);
1314
1315         zfree(content_filter_zone, cfc);
1316 done:
1317         if (error == 0) {
1318                 OSIncrementAtomic(&cfil_stats.cfs_ctl_disconnect_ok);
1319         } else {
1320                 OSIncrementAtomic(&cfil_stats.cfs_ctl_disconnect_fail);
1321         }
1322
1323         CFIL_LOG(LOG_INFO, "return %d cfil_active_count %u kcunit %u",
1324             error, cfil_active_count, kcunit);
1325
1326         return error;
1327 }
1328
1329 /*
1330  * cfil_acquire_sockbuf()
1331  *
1332  * Prevent any other thread from acquiring the sockbuf
1333  * We use sb_cfil_thread as a semaphore to prevent other threads from
1334  * messing with the sockbuf -- see sblock()
1335  * Note: We do not set SB_LOCK here because the thread may check or modify
1336  * SB_LOCK several times until it calls cfil_release_sockbuf() -- currently
1337  * sblock(), sbunlock() or sodefunct()
1338  */
1339 static int
1340 cfil_acquire_sockbuf(struct socket *so, struct cfil_info *cfil_info, int outgoing)
1341 {
1342         thread_t tp = current_thread();
1343         struct sockbuf *sb = outgoing ? &so->so_snd : &so->so_rcv;
1344         lck_mtx_t *mutex_held;
1345         int error = 0;
1346
1347         /*
1348          * Wait until no thread is holding the sockbuf and other content
1349          * filter threads have released the sockbuf
1350          */
1351         while ((sb->sb_flags & SB_LOCK) ||
1352             (sb->sb_cfil_thread != NULL && sb->sb_cfil_thread != tp)) {
1353                 if (so->so_proto->pr_getlock != NULL) {
1354                         mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1355                 } else {
1356                         mutex_held = so->so_proto->pr_domain->dom_mtx;
1357                 }
1358
1359                 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1360
1361                 sb->sb_wantlock++;
1362                 VERIFY(sb->sb_wantlock != 0);
1363
1364                 msleep(&sb->sb_flags, mutex_held, PSOCK, "cfil_acquire_sockbuf",
1365                     NULL);
1366
1367                 VERIFY(sb->sb_wantlock != 0);
1368                 sb->sb_wantlock--;
1369         }
1370         /*
1371          * Use reference count for repetitive calls on same thread
1372          */
1373         if (sb->sb_cfil_refs == 0) {
1374                 VERIFY(sb->sb_cfil_thread == NULL);
1375                 VERIFY((sb->sb_flags & SB_LOCK) == 0);
1376
1377                 sb->sb_cfil_thread = tp;
1378                 sb->sb_flags |= SB_LOCK;
1379         }
1380         sb->sb_cfil_refs++;
1381
1382         /* We acquire the socket buffer when we need to cleanup */
1383         if (cfil_info == NULL) {
1384                 CFIL_LOG(LOG_ERR, "so %llx cfil detached",
1385                     (uint64_t)VM_KERNEL_ADDRPERM(so));
1386                 error = 0;
1387         } else if (cfil_info->cfi_flags & CFIF_DROP) {
1388                 CFIL_LOG(LOG_ERR, "so %llx drop set",
1389                     (uint64_t)VM_KERNEL_ADDRPERM(so));
1390                 error = EPIPE;
1391         }
1392
1393         return error;
1394 }
1395
1396 static void
1397 cfil_release_sockbuf(struct socket *so, int outgoing)
1398 {
1399         struct sockbuf *sb = outgoing ? &so->so_snd : &so->so_rcv;
1400         thread_t tp = current_thread();
1401
1402         socket_lock_assert_owned(so);
1403
1404         if (sb->sb_cfil_thread != NULL && sb->sb_cfil_thread != tp) {
1405                 panic("%s sb_cfil_thread %p not current %p", __func__,
1406                     sb->sb_cfil_thread, tp);
1407         }
1408         /*
1409          * Don't panic if we are defunct because SB_LOCK has
1410          * been cleared by sodefunct()
1411          */
1412         if (!(so->so_flags & SOF_DEFUNCT) && !(sb->sb_flags & SB_LOCK)) {
1413                 panic("%s SB_LOCK not set on %p", __func__,
1414                     sb);
1415         }
1416         /*
1417          * We can unlock when the thread unwinds to the last reference
1418          */
1419         sb->sb_cfil_refs--;
1420         if (sb->sb_cfil_refs == 0) {
1421                 sb->sb_cfil_thread = NULL;
1422                 sb->sb_flags &= ~SB_LOCK;
1423
1424                 if (sb->sb_wantlock > 0) {
1425                         wakeup(&sb->sb_flags);
1426                 }
1427         }
1428 }
1429
1430 cfil_sock_id_t
1431 cfil_sock_id_from_socket(struct socket *so)
1432 {
1433         if ((so->so_flags & SOF_CONTENT_FILTER) && so->so_cfil) {
1434                 return so->so_cfil->cfi_sock_id;
1435         } else {
1436                 return CFIL_SOCK_ID_NONE;
1437         }
1438 }
1439
1440 static bool
1441 cfil_socket_safe_lock(struct inpcb *inp)
1442 {
1443         if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
1444                 socket_lock(inp->inp_socket, 1);
1445                 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) != WNT_STOPUSING) {
1446                         return true;
1447                 }
1448                 socket_unlock(inp->inp_socket, 1);
1449         }
1450         return false;
1451 }
1452
1453 static struct socket *
1454 cfil_socket_from_sock_id(cfil_sock_id_t cfil_sock_id, bool udp_only)
1455 {
1456         struct socket *so = NULL;
1457         u_int64_t gencnt = cfil_sock_id >> 32;
1458         u_int32_t flowhash = (u_int32_t)(cfil_sock_id & 0x0ffffffff);
1459         struct inpcb *inp = NULL;
1460         struct inpcbinfo *pcbinfo = NULL;
1461
1462 #if VERDICT_DEBUG
1463         CFIL_LOG(LOG_ERR, "CFIL: VERDICT: search for socket: id %llu gencnt %llx flowhash %x", cfil_sock_id, gencnt, flowhash);
1464 #endif
1465
1466         if (udp_only) {
1467                 goto find_udp;
1468         }
1469
1470         pcbinfo = &tcbinfo;
1471         lck_rw_lock_shared(pcbinfo->ipi_lock);
1472         LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
1473                 if (inp->inp_state != INPCB_STATE_DEAD &&
1474                     inp->inp_socket != NULL &&
1475                     inp->inp_flowhash == flowhash &&
1476                     (inp->inp_socket->so_gencnt & 0x0ffffffff) == gencnt &&
1477                     inp->inp_socket->so_cfil != NULL) {
1478                         if (cfil_socket_safe_lock(inp)) {
1479                                 so = inp->inp_socket;
1480                         }
1481                         break;
1482                 }
1483         }
1484         lck_rw_done(pcbinfo->ipi_lock);
1485         if (so != NULL) {
1486                 goto done;
1487         }
1488
1489 find_udp:
1490
1491         pcbinfo = &udbinfo;
1492         lck_rw_lock_shared(pcbinfo->ipi_lock);
1493         LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
1494                 if (inp->inp_state != INPCB_STATE_DEAD &&
1495                     inp->inp_socket != NULL &&
1496                     inp->inp_socket->so_cfil_db != NULL &&
1497                     (inp->inp_socket->so_gencnt & 0x0ffffffff) == gencnt) {
1498                         if (cfil_socket_safe_lock(inp)) {
1499                                 so = inp->inp_socket;
1500                         }
1501                         break;
1502                 }
1503         }
1504         lck_rw_done(pcbinfo->ipi_lock);
1505
1506 done:
1507         if (so == NULL) {
1508                 OSIncrementAtomic(&cfil_stats.cfs_sock_id_not_found);
1509                 CFIL_LOG(LOG_DEBUG,
1510                     "no socket for sock_id %llx gencnt %llx flowhash %x",
1511                     cfil_sock_id, gencnt, flowhash);
1512         }
1513
1514         return so;
1515 }
1516
1517 static struct socket *
1518 cfil_socket_from_client_uuid(uuid_t necp_client_uuid, bool *cfil_attached)
1519 {
1520         struct socket *so = NULL;
1521         struct inpcb *inp = NULL;
1522         struct inpcbinfo *pcbinfo = &tcbinfo;
1523
1524         lck_rw_lock_shared(pcbinfo->ipi_lock);
1525         LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
1526                 if (inp->inp_state != INPCB_STATE_DEAD &&
1527                     inp->inp_socket != NULL &&
1528                     uuid_compare(inp->necp_client_uuid, necp_client_uuid) == 0) {
1529                         *cfil_attached = (inp->inp_socket->so_cfil != NULL);
1530                         if (cfil_socket_safe_lock(inp)) {
1531                                 so = inp->inp_socket;
1532                         }
1533                         break;
1534                 }
1535         }
1536         lck_rw_done(pcbinfo->ipi_lock);
1537         if (so != NULL) {
1538                 goto done;
1539         }
1540
1541         pcbinfo = &udbinfo;
1542         lck_rw_lock_shared(pcbinfo->ipi_lock);
1543         LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
1544                 if (inp->inp_state != INPCB_STATE_DEAD &&
1545                     inp->inp_socket != NULL &&
1546                     uuid_compare(inp->necp_client_uuid, necp_client_uuid) == 0) {
1547                         *cfil_attached = (inp->inp_socket->so_cfil_db != NULL);
1548                         if (cfil_socket_safe_lock(inp)) {
1549                                 so = inp->inp_socket;
1550                         }
1551                         break;
1552                 }
1553         }
1554         lck_rw_done(pcbinfo->ipi_lock);
1555
1556 done:
1557         return so;
1558 }
1559
1560 static errno_t
1561 cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m,
1562     int flags)
1563 {
1564 #pragma unused(kctlref, flags)
1565         errno_t error = 0;
1566         struct cfil_msg_hdr *msghdr;
1567         struct content_filter *cfc = (struct content_filter *)unitinfo;
1568         struct socket *so;
1569         struct cfil_msg_action *action_msg;
1570         struct cfil_entry *entry;
1571         struct cfil_info *cfil_info = NULL;
1572
1573         CFIL_LOG(LOG_INFO, "");
1574
1575         if (content_filters == NULL) {
1576                 CFIL_LOG(LOG_ERR, "no content filter");
1577                 error = EINVAL;
1578                 goto done;
1579         }
1580         if (kcunit > MAX_CONTENT_FILTER) {
1581                 CFIL_LOG(LOG_ERR, "kcunit %u > MAX_CONTENT_FILTER (%d)",
1582                     kcunit, MAX_CONTENT_FILTER);
1583                 error = EINVAL;
1584                 goto done;
1585         }
1586
1587         if (m_length(m) < sizeof(struct cfil_msg_hdr)) {
1588                 CFIL_LOG(LOG_ERR, "too short %u", m_length(m));
1589                 error = EINVAL;
1590                 goto done;
1591         }
1592         msghdr = (struct cfil_msg_hdr *)mbuf_data(m);
1593         if (msghdr->cfm_version != CFM_VERSION_CURRENT) {
1594                 CFIL_LOG(LOG_ERR, "bad version %u", msghdr->cfm_version);
1595                 error = EINVAL;
1596                 goto done;
1597         }
1598         if (msghdr->cfm_type != CFM_TYPE_ACTION) {
1599                 CFIL_LOG(LOG_ERR, "bad type %u", msghdr->cfm_type);
1600                 error = EINVAL;
1601                 goto done;
1602         }
1603         /* Validate action operation */
1604         switch (msghdr->cfm_op) {
1605         case CFM_OP_DATA_UPDATE:
1606                 OSIncrementAtomic(
1607                         &cfil_stats.cfs_ctl_action_data_update);
1608                 break;
1609         case CFM_OP_DROP:
1610                 OSIncrementAtomic(&cfil_stats.cfs_ctl_action_drop);
1611                 break;
1612         case CFM_OP_BLESS_CLIENT:
1613                 if (msghdr->cfm_len != sizeof(struct cfil_msg_bless_client)) {
1614                         OSIncrementAtomic(&cfil_stats.cfs_ctl_action_bad_len);
1615                         error = EINVAL;
1616                         CFIL_LOG(LOG_ERR, "bad len: %u for op %u",
1617                             msghdr->cfm_len,
1618                             msghdr->cfm_op);
1619                         goto done;
1620                 }
1621                 error = cfil_action_bless_client(kcunit, msghdr);
1622                 goto done;
1623         default:
1624                 OSIncrementAtomic(&cfil_stats.cfs_ctl_action_bad_op);
1625                 CFIL_LOG(LOG_ERR, "bad op %u", msghdr->cfm_op);
1626                 error = EINVAL;
1627                 goto done;
1628         }
1629         if (msghdr->cfm_len != sizeof(struct cfil_msg_action)) {
1630                 OSIncrementAtomic(&cfil_stats.cfs_ctl_action_bad_len);
1631                 error = EINVAL;
1632                 CFIL_LOG(LOG_ERR, "bad len: %u for op %u",
1633                     msghdr->cfm_len,
1634                     msghdr->cfm_op);
1635                 goto done;
1636         }
1637         cfil_rw_lock_shared(&cfil_lck_rw);
1638         if (cfc != (void *)content_filters[kcunit - 1]) {
1639                 CFIL_LOG(LOG_ERR, "unitinfo does not match for kcunit %u",
1640                     kcunit);
1641                 error = EINVAL;
1642                 cfil_rw_unlock_shared(&cfil_lck_rw);
1643                 goto done;
1644         }
1645         cfil_rw_unlock_shared(&cfil_lck_rw);
1646
1647         // Search for socket (TCP+UDP and lock so)
1648         so = cfil_socket_from_sock_id(msghdr->cfm_sock_id, false);
1649         if (so == NULL) {
1650                 CFIL_LOG(LOG_NOTICE, "bad sock_id %llx",
1651                     msghdr->cfm_sock_id);
1652                 error = EINVAL;
1653                 goto done;
1654         }
1655
1656         cfil_info = so->so_cfil_db != NULL ?
1657             cfil_db_get_cfil_info(so->so_cfil_db, msghdr->cfm_sock_id) : so->so_cfil;
1658
1659         if (cfil_info == NULL) {
1660                 CFIL_LOG(LOG_NOTICE, "so %llx <id %llu> not attached",
1661                     (uint64_t)VM_KERNEL_ADDRPERM(so), msghdr->cfm_sock_id);
1662                 error = EINVAL;
1663                 goto unlock;
1664         } else if (cfil_info->cfi_flags & CFIF_DROP) {
1665                 CFIL_LOG(LOG_NOTICE, "so %llx drop set",
1666                     (uint64_t)VM_KERNEL_ADDRPERM(so));
1667                 error = EINVAL;
1668                 goto unlock;
1669         }
1670         entry = &cfil_info->cfi_entries[kcunit - 1];
1671         if (entry->cfe_filter == NULL) {
1672                 CFIL_LOG(LOG_NOTICE, "so %llx no filter",
1673                     (uint64_t)VM_KERNEL_ADDRPERM(so));
1674                 error = EINVAL;
1675                 goto unlock;
1676         }
1677
1678         if (entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED) {
1679                 entry->cfe_flags |= CFEF_DATA_START;
1680         } else {
1681                 CFIL_LOG(LOG_ERR,
1682                     "so %llx attached not sent for %u",
1683                     (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit);
1684                 error = EINVAL;
1685                 goto unlock;
1686         }
1687
1688         microuptime(&entry->cfe_last_action);
1689         CFI_ADD_TIME_LOG(cfil_info, &entry->cfe_last_action, &cfil_info->cfi_first_event, msghdr->cfm_op);
1690
1691         action_msg = (struct cfil_msg_action *)msghdr;
1692
1693         switch (msghdr->cfm_op) {
1694         case CFM_OP_DATA_UPDATE:
1695 #if VERDICT_DEBUG
1696                 CFIL_LOG(LOG_ERR, "CFIL: VERDICT RECEIVED: <so %llx sockID %llu> <IN peek:%llu pass:%llu, OUT peek:%llu pass:%llu>",
1697                     (uint64_t)VM_KERNEL_ADDRPERM(so),
1698                     cfil_info->cfi_sock_id,
1699                     action_msg->cfa_in_peek_offset, action_msg->cfa_in_pass_offset,
1700                     action_msg->cfa_out_peek_offset, action_msg->cfa_out_pass_offset);
1701 #endif
1702                 if (action_msg->cfa_out_peek_offset != 0 ||
1703                     action_msg->cfa_out_pass_offset != 0) {
1704                         error = cfil_action_data_pass(so, cfil_info, kcunit, 1,
1705                             action_msg->cfa_out_pass_offset,
1706                             action_msg->cfa_out_peek_offset);
1707                 }
1708                 if (error == EJUSTRETURN) {
1709                         error = 0;
1710                 }
1711                 if (error != 0) {
1712                         break;
1713                 }
1714                 if (action_msg->cfa_in_peek_offset != 0 ||
1715                     action_msg->cfa_in_pass_offset != 0) {
1716                         error = cfil_action_data_pass(so, cfil_info, kcunit, 0,
1717                             action_msg->cfa_in_pass_offset,
1718                             action_msg->cfa_in_peek_offset);
1719                 }
1720                 if (error == EJUSTRETURN) {
1721                         error = 0;
1722                 }
1723                 break;
1724
1725         case CFM_OP_DROP:
1726                 error = cfil_action_drop(so, cfil_info, kcunit);
1727                 break;
1728
1729         default:
1730                 error = EINVAL;
1731                 break;
1732         }
1733 unlock:
1734         socket_unlock(so, 1);
1735 done:
1736         mbuf_freem(m);
1737
1738         if (error == 0) {
1739                 OSIncrementAtomic(&cfil_stats.cfs_ctl_send_ok);
1740         } else {
1741                 OSIncrementAtomic(&cfil_stats.cfs_ctl_send_bad);
1742         }
1743
1744         return error;
1745 }
1746
1747 static errno_t
1748 cfil_ctl_getopt(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
1749     int opt, void *data, size_t *len)
1750 {
1751 #pragma unused(kctlref, opt)
1752         struct cfil_info *cfil_info = NULL;
1753         errno_t error = 0;
1754         struct content_filter *cfc = (struct content_filter *)unitinfo;
1755
1756         CFIL_LOG(LOG_NOTICE, "");
1757
1758         cfil_rw_lock_shared(&cfil_lck_rw);
1759
1760         if (content_filters == NULL) {
1761                 CFIL_LOG(LOG_ERR, "no content filter");
1762                 error = EINVAL;
1763                 goto done;
1764         }
1765         if (kcunit > MAX_CONTENT_FILTER) {
1766                 CFIL_LOG(LOG_ERR, "kcunit %u > MAX_CONTENT_FILTER (%d)",
1767                     kcunit, MAX_CONTENT_FILTER);
1768                 error = EINVAL;
1769                 goto done;
1770         }
1771         if (cfc != (void *)content_filters[kcunit - 1]) {
1772                 CFIL_LOG(LOG_ERR, "unitinfo does not match for kcunit %u",
1773                     kcunit);
1774                 error = EINVAL;
1775                 goto done;
1776         }
1777         switch (opt) {
1778         case CFIL_OPT_NECP_CONTROL_UNIT:
1779                 if (*len < sizeof(uint32_t)) {
1780                         CFIL_LOG(LOG_ERR, "len too small %lu", *len);
1781                         error = EINVAL;
1782                         goto done;
1783                 }
1784                 if (data != NULL) {
1785                         *(uint32_t *)data = cfc->cf_necp_control_unit;
1786                 }
1787                 break;
1788         case CFIL_OPT_GET_SOCKET_INFO:
1789                 if (*len != sizeof(struct cfil_opt_sock_info)) {
1790                         CFIL_LOG(LOG_ERR, "len does not match %lu", *len);
1791                         error = EINVAL;
1792                         goto done;
1793                 }
1794                 if (data == NULL) {
1795                         CFIL_LOG(LOG_ERR, "data not passed");
1796                         error = EINVAL;
1797                         goto done;
1798                 }
1799
1800                 struct cfil_opt_sock_info *sock_info =
1801                     (struct cfil_opt_sock_info *) data;
1802
1803                 // Unlock here so that we never hold both cfil_lck_rw and the
1804                 // socket_lock at the same time. Otherwise, this can deadlock
1805                 // because soclose() takes the socket_lock and then exclusive
1806                 // cfil_lck_rw and we require the opposite order.
1807
1808                 // WARNING: Be sure to never use anything protected
1809                 //     by cfil_lck_rw beyond this point.
1810                 // WARNING: Be sure to avoid fallthrough and
1811                 //     goto return_already_unlocked from this branch.
1812                 cfil_rw_unlock_shared(&cfil_lck_rw);
1813
1814                 // Search (TCP+UDP) and lock socket
1815                 struct socket *sock =
1816                     cfil_socket_from_sock_id(sock_info->cfs_sock_id, false);
1817                 if (sock == NULL) {
1818 #if LIFECYCLE_DEBUG
1819                         CFIL_LOG(LOG_ERR, "CFIL: GET_SOCKET_INFO failed: bad sock_id %llu",
1820                             sock_info->cfs_sock_id);
1821 #endif
1822                         error = ENOENT;
1823                         goto return_already_unlocked;
1824                 }
1825
1826                 cfil_info = (sock->so_cfil_db != NULL) ?
1827                     cfil_db_get_cfil_info(sock->so_cfil_db, sock_info->cfs_sock_id) : sock->so_cfil;
1828
1829                 if (cfil_info == NULL) {
1830 #if LIFECYCLE_DEBUG
1831                         CFIL_LOG(LOG_ERR, "CFIL: GET_SOCKET_INFO failed: so %llx not attached, cannot fetch info",
1832                             (uint64_t)VM_KERNEL_ADDRPERM(sock));
1833 #endif
1834                         error = EINVAL;
1835                         socket_unlock(sock, 1);
1836                         goto return_already_unlocked;
1837                 }
1838
1839                 // Fill out family, type, and protocol
1840                 sock_info->cfs_sock_family = sock->so_proto->pr_domain->dom_family;
1841                 sock_info->cfs_sock_type = sock->so_proto->pr_type;
1842                 sock_info->cfs_sock_protocol = sock->so_proto->pr_protocol;
1843
1844                 // Source and destination addresses
1845                 struct inpcb *inp = sotoinpcb(sock);
1846                 if (inp->inp_vflag & INP_IPV6) {
1847                         struct in6_addr *laddr = NULL, *faddr = NULL;
1848                         u_int16_t lport = 0, fport = 0;
1849
1850                         cfil_get_flow_address_v6(cfil_info->cfi_hash_entry, inp,
1851                             &laddr, &faddr, &lport, &fport);
1852                         fill_ip6_sockaddr_4_6(&sock_info->cfs_local, laddr, lport);
1853                         fill_ip6_sockaddr_4_6(&sock_info->cfs_remote, faddr, fport);
1854                 } else if (inp->inp_vflag & INP_IPV4) {
1855                         struct in_addr laddr = {0}, faddr = {0};
1856                         u_int16_t lport = 0, fport = 0;
1857
1858                         cfil_get_flow_address(cfil_info->cfi_hash_entry, inp,
1859                             &laddr, &faddr, &lport, &fport);
1860                         fill_ip_sockaddr_4_6(&sock_info->cfs_local, laddr, lport);
1861                         fill_ip_sockaddr_4_6(&sock_info->cfs_remote, faddr, fport);
1862                 }
1863
1864                 // Set the pid info
1865                 sock_info->cfs_pid = sock->last_pid;
1866                 memcpy(sock_info->cfs_uuid, sock->last_uuid, sizeof(uuid_t));
1867
1868                 if (sock->so_flags & SOF_DELEGATED) {
1869                         sock_info->cfs_e_pid = sock->e_pid;
1870                         memcpy(sock_info->cfs_e_uuid, sock->e_uuid, sizeof(uuid_t));
1871                 } else {
1872                         sock_info->cfs_e_pid = sock->last_pid;
1873                         memcpy(sock_info->cfs_e_uuid, sock->last_uuid, sizeof(uuid_t));
1874                 }
1875
1876                 socket_unlock(sock, 1);
1877
1878                 goto return_already_unlocked;
1879         default:
1880                 error = ENOPROTOOPT;
1881                 break;
1882         }
1883 done:
1884         cfil_rw_unlock_shared(&cfil_lck_rw);
1885
1886         return error;
1887
1888 return_already_unlocked:
1889
1890         return error;
1891 }
1892
1893 static errno_t
1894 cfil_ctl_setopt(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
1895     int opt, void *data, size_t len)
1896 {
1897 #pragma unused(kctlref, opt)
1898         errno_t error = 0;
1899         struct content_filter *cfc = (struct content_filter *)unitinfo;
1900
1901         CFIL_LOG(LOG_NOTICE, "");
1902
1903         cfil_rw_lock_exclusive(&cfil_lck_rw);
1904
1905         if (content_filters == NULL) {
1906                 CFIL_LOG(LOG_ERR, "no content filter");
1907                 error = EINVAL;
1908                 goto done;
1909         }
1910         if (kcunit > MAX_CONTENT_FILTER) {
1911                 CFIL_LOG(LOG_ERR, "kcunit %u > MAX_CONTENT_FILTER (%d)",
1912                     kcunit, MAX_CONTENT_FILTER);
1913                 error = EINVAL;
1914                 goto done;
1915         }
1916         if (cfc != (void *)content_filters[kcunit - 1]) {
1917                 CFIL_LOG(LOG_ERR, "unitinfo does not match for kcunit %u",
1918                     kcunit);
1919                 error = EINVAL;
1920                 goto done;
1921         }
1922         switch (opt) {
1923         case CFIL_OPT_NECP_CONTROL_UNIT:
1924                 if (len < sizeof(uint32_t)) {
1925                         CFIL_LOG(LOG_ERR, "CFIL_OPT_NECP_CONTROL_UNIT "
1926                             "len too small %lu", len);
1927                         error = EINVAL;
1928                         goto done;
1929                 }
1930                 if (cfc->cf_necp_control_unit != 0) {
1931                         CFIL_LOG(LOG_ERR, "CFIL_OPT_NECP_CONTROL_UNIT "
1932                             "already set %u",
1933                             cfc->cf_necp_control_unit);
1934                         error = EINVAL;
1935                         goto done;
1936                 }
1937                 cfc->cf_necp_control_unit = *(uint32_t *)data;
1938                 break;
1939         default:
1940                 error = ENOPROTOOPT;
1941                 break;
1942         }
1943 done:
1944         cfil_rw_unlock_exclusive(&cfil_lck_rw);
1945
1946         return error;
1947 }
1948
1949
1950 static void
1951 cfil_ctl_rcvd(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, int flags)
1952 {
1953 #pragma unused(kctlref, flags)
1954         struct content_filter *cfc = (struct content_filter *)unitinfo;
1955         struct socket *so = NULL;
1956         int error;
1957         struct cfil_entry *entry;
1958         struct cfil_info *cfil_info = NULL;
1959
1960         CFIL_LOG(LOG_INFO, "");
1961
1962         if (content_filters == NULL) {
1963                 CFIL_LOG(LOG_ERR, "no content filter");
1964                 OSIncrementAtomic(&cfil_stats.cfs_ctl_rcvd_bad);
1965                 return;
1966         }
1967         if (kcunit > MAX_CONTENT_FILTER) {
1968                 CFIL_LOG(LOG_ERR, "kcunit %u > MAX_CONTENT_FILTER (%d)",
1969                     kcunit, MAX_CONTENT_FILTER);
1970                 OSIncrementAtomic(&cfil_stats.cfs_ctl_rcvd_bad);
1971                 return;
1972         }
1973         cfil_rw_lock_shared(&cfil_lck_rw);
1974         if (cfc != (void *)content_filters[kcunit - 1]) {
1975                 CFIL_LOG(LOG_ERR, "unitinfo does not match for kcunit %u",
1976                     kcunit);
1977                 OSIncrementAtomic(&cfil_stats.cfs_ctl_rcvd_bad);
1978                 goto done;
1979         }
1980         /* Let's assume the flow control is lifted */
1981         if (cfc->cf_flags & CFF_FLOW_CONTROLLED) {
1982                 if (!cfil_rw_lock_shared_to_exclusive(&cfil_lck_rw)) {
1983                         cfil_rw_lock_exclusive(&cfil_lck_rw);
1984                 }
1985
1986                 cfc->cf_flags &= ~CFF_FLOW_CONTROLLED;
1987
1988                 cfil_rw_lock_exclusive_to_shared(&cfil_lck_rw);
1989                 LCK_RW_ASSERT(&cfil_lck_rw, LCK_RW_ASSERT_SHARED);
1990         }
1991         /*
1992          * Flow control will be raised again as soon as an entry cannot enqueue
1993          * to the kernel control socket
1994          */
1995         while ((cfc->cf_flags & CFF_FLOW_CONTROLLED) == 0) {
1996                 verify_content_filter(cfc);
1997
1998                 cfil_rw_lock_assert_held(&cfil_lck_rw, 0);
1999
2000                 /* Find an entry that is flow controlled */
2001                 TAILQ_FOREACH(entry, &cfc->cf_sock_entries, cfe_link) {
2002                         if (entry->cfe_cfil_info == NULL ||
2003                             entry->cfe_cfil_info->cfi_so == NULL) {
2004                                 continue;
2005                         }
2006                         if ((entry->cfe_flags & CFEF_FLOW_CONTROLLED) == 0) {
2007                                 continue;
2008                         }
2009                 }
2010                 if (entry == NULL) {
2011                         break;
2012                 }
2013
2014                 OSIncrementAtomic(&cfil_stats.cfs_ctl_rcvd_flow_lift);
2015
2016                 cfil_info = entry->cfe_cfil_info;
2017                 so = cfil_info->cfi_so;
2018
2019                 cfil_rw_unlock_shared(&cfil_lck_rw);
2020                 socket_lock(so, 1);
2021
2022                 do {
2023                         error = cfil_acquire_sockbuf(so, cfil_info, 1);
2024                         if (error == 0) {
2025                                 error = cfil_data_service_ctl_q(so, cfil_info, kcunit, 1);
2026                         }
2027                         cfil_release_sockbuf(so, 1);
2028                         if (error != 0) {
2029                                 break;
2030                         }
2031
2032                         error = cfil_acquire_sockbuf(so, cfil_info, 0);
2033                         if (error == 0) {
2034                                 error = cfil_data_service_ctl_q(so, cfil_info, kcunit, 0);
2035                         }
2036                         cfil_release_sockbuf(so, 0);
2037                 } while (0);
2038
2039                 socket_lock_assert_owned(so);
2040                 socket_unlock(so, 1);
2041
2042                 cfil_rw_lock_shared(&cfil_lck_rw);
2043         }
2044 done:
2045         cfil_rw_unlock_shared(&cfil_lck_rw);
2046 }
2047
2048 void
2049 cfil_init(void)
2050 {
2051         struct kern_ctl_reg kern_ctl;
2052         errno_t error = 0;
2053         vm_size_t content_filter_size = 0;      /* size of content_filter */
2054         vm_size_t cfil_info_size = 0;   /* size of cfil_info */
2055         vm_size_t cfil_hash_entry_size = 0; /* size of cfil_hash_entry */
2056         vm_size_t cfil_db_size = 0; /* size of cfil_db */
2057         unsigned int mbuf_limit = 0;
2058
2059         CFIL_LOG(LOG_NOTICE, "");
2060
2061         /*
2062          * Compile time verifications
2063          */
2064         _CASSERT(CFIL_MAX_FILTER_COUNT == MAX_CONTENT_FILTER);
2065         _CASSERT(sizeof(struct cfil_filter_stat) % sizeof(uint32_t) == 0);
2066         _CASSERT(sizeof(struct cfil_entry_stat) % sizeof(uint32_t) == 0);
2067         _CASSERT(sizeof(struct cfil_sock_stat) % sizeof(uint32_t) == 0);
2068
2069         /*
2070          * Runtime time verifications
2071          */
2072         VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_ctl_q_in_enqueued,
2073             sizeof(uint32_t)));
2074         VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_ctl_q_out_enqueued,
2075             sizeof(uint32_t)));
2076         VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_ctl_q_in_peeked,
2077             sizeof(uint32_t)));
2078         VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_ctl_q_out_peeked,
2079             sizeof(uint32_t)));
2080
2081         VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_pending_q_in_enqueued,
2082             sizeof(uint32_t)));
2083         VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_pending_q_out_enqueued,
2084             sizeof(uint32_t)));
2085
2086         VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_inject_q_in_enqueued,
2087             sizeof(uint32_t)));
2088         VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_inject_q_out_enqueued,
2089             sizeof(uint32_t)));
2090         VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_inject_q_in_passed,
2091             sizeof(uint32_t)));
2092         VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_inject_q_out_passed,
2093             sizeof(uint32_t)));
2094
2095         /*
2096          * Zone for content filters kernel control sockets
2097          */
2098         content_filter_size = sizeof(struct content_filter);
2099         content_filter_zone = zinit(content_filter_size,
2100             CONTENT_FILTER_ZONE_MAX * content_filter_size,
2101             0,
2102             CONTENT_FILTER_ZONE_NAME);
2103         if (content_filter_zone == NULL) {
2104                 panic("%s: zinit(%s) failed", __func__,
2105                     CONTENT_FILTER_ZONE_NAME);
2106                 /* NOTREACHED */
2107         }
2108         zone_change(content_filter_zone, Z_CALLERACCT, FALSE);
2109         zone_change(content_filter_zone, Z_EXPAND, TRUE);
2110
2111         /*
2112          * Zone for per socket content filters
2113          */
2114         cfil_info_size = sizeof(struct cfil_info);
2115         cfil_info_zone = zinit(cfil_info_size,
2116             CFIL_INFO_ZONE_MAX * cfil_info_size,
2117             0,
2118             CFIL_INFO_ZONE_NAME);
2119         if (cfil_info_zone == NULL) {
2120                 panic("%s: zinit(%s) failed", __func__, CFIL_INFO_ZONE_NAME);
2121                 /* NOTREACHED */
2122         }
2123         zone_change(cfil_info_zone, Z_CALLERACCT, FALSE);
2124         zone_change(cfil_info_zone, Z_EXPAND, TRUE);
2125
2126         /*
2127          * Zone for content filters cfil hash entries and db
2128          */
2129         cfil_hash_entry_size = sizeof(struct cfil_hash_entry);
2130         cfil_hash_entry_zone = zinit(cfil_hash_entry_size,
2131             CFIL_HASH_ENTRY_ZONE_MAX * cfil_hash_entry_size,
2132             0,
2133             CFIL_HASH_ENTRY_ZONE_NAME);
2134         if (cfil_hash_entry_zone == NULL) {
2135                 panic("%s: zinit(%s) failed", __func__, CFIL_HASH_ENTRY_ZONE_NAME);
2136                 /* NOTREACHED */
2137         }
2138         zone_change(cfil_hash_entry_zone, Z_CALLERACCT, FALSE);
2139         zone_change(cfil_hash_entry_zone, Z_EXPAND, TRUE);
2140
2141         cfil_db_size = sizeof(struct cfil_db);
2142         cfil_db_zone = zinit(cfil_db_size,
2143             CFIL_DB_ZONE_MAX * cfil_db_size,
2144             0,
2145             CFIL_DB_ZONE_NAME);
2146         if (cfil_db_zone == NULL) {
2147                 panic("%s: zinit(%s) failed", __func__, CFIL_DB_ZONE_NAME);
2148                 /* NOTREACHED */
2149         }
2150         zone_change(cfil_db_zone, Z_CALLERACCT, FALSE);
2151         zone_change(cfil_db_zone, Z_EXPAND, TRUE);
2152
2153         /*
2154          * Allocate locks
2155          */
2156         cfil_lck_grp_attr = lck_grp_attr_alloc_init();
2157         if (cfil_lck_grp_attr == NULL) {
2158                 panic("%s: lck_grp_attr_alloc_init failed", __func__);
2159                 /* NOTREACHED */
2160         }
2161         cfil_lck_grp = lck_grp_alloc_init("content filter",
2162             cfil_lck_grp_attr);
2163         if (cfil_lck_grp == NULL) {
2164                 panic("%s: lck_grp_alloc_init failed", __func__);
2165                 /* NOTREACHED */
2166         }
2167         cfil_lck_attr = lck_attr_alloc_init();
2168         if (cfil_lck_attr == NULL) {
2169                 panic("%s: lck_attr_alloc_init failed", __func__);
2170                 /* NOTREACHED */
2171         }
2172         lck_rw_init(&cfil_lck_rw, cfil_lck_grp, cfil_lck_attr);
2173
2174         TAILQ_INIT(&cfil_sock_head);
2175
2176         /*
2177          * Register kernel control
2178          */
2179         bzero(&kern_ctl, sizeof(kern_ctl));
2180         strlcpy(kern_ctl.ctl_name, CONTENT_FILTER_CONTROL_NAME,
2181             sizeof(kern_ctl.ctl_name));
2182         kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED | CTL_FLAG_REG_EXTENDED;
2183         kern_ctl.ctl_sendsize = 512 * 1024; /* enough? */
2184         kern_ctl.ctl_recvsize = 512 * 1024; /* enough? */
2185         kern_ctl.ctl_connect = cfil_ctl_connect;
2186         kern_ctl.ctl_disconnect = cfil_ctl_disconnect;
2187         kern_ctl.ctl_send = cfil_ctl_send;
2188         kern_ctl.ctl_getopt = cfil_ctl_getopt;
2189         kern_ctl.ctl_setopt = cfil_ctl_setopt;
2190         kern_ctl.ctl_rcvd = cfil_ctl_rcvd;
2191         error = ctl_register(&kern_ctl, &cfil_kctlref);
2192         if (error != 0) {
2193                 CFIL_LOG(LOG_ERR, "ctl_register failed: %d", error);
2194                 return;
2195         }
2196
2197         // Spawn thread for gargage collection
2198         if (kernel_thread_start(cfil_udp_gc_thread_func, NULL,
2199             &cfil_udp_gc_thread) != KERN_SUCCESS) {
2200                 panic_plain("%s: Can't create UDP GC thread", __func__);
2201                 /* NOTREACHED */
2202         }
2203         /* this must not fail */
2204         VERIFY(cfil_udp_gc_thread != NULL);
2205
2206         // Set UDP per-flow mbuf thresholds to 1/32 of platform max
2207         mbuf_limit = MAX(UDP_FLOW_GC_MBUF_CNT_MAX, (nmbclusters << MCLSHIFT) >> UDP_FLOW_GC_MBUF_SHIFT);
2208         cfil_udp_gc_mbuf_num_max = (mbuf_limit >> MCLSHIFT);
2209         cfil_udp_gc_mbuf_cnt_max = mbuf_limit;
2210 }
2211
2212 struct cfil_info *
2213 cfil_info_alloc(struct socket *so, struct cfil_hash_entry *hash_entry)
2214 {
2215         int kcunit;
2216         struct cfil_info *cfil_info = NULL;
2217         struct inpcb *inp = sotoinpcb(so);
2218
2219         CFIL_LOG(LOG_INFO, "");
2220
2221         socket_lock_assert_owned(so);
2222
2223         cfil_info = zalloc(cfil_info_zone);
2224         if (cfil_info == NULL) {
2225                 goto done;
2226         }
2227         bzero(cfil_info, sizeof(struct cfil_info));
2228
2229         cfil_queue_init(&cfil_info->cfi_snd.cfi_inject_q);
2230         cfil_queue_init(&cfil_info->cfi_rcv.cfi_inject_q);
2231
2232         for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
2233                 struct cfil_entry *entry;
2234
2235                 entry = &cfil_info->cfi_entries[kcunit - 1];
2236                 entry->cfe_cfil_info = cfil_info;
2237
2238                 /* Initialize the filter entry */
2239                 entry->cfe_filter = NULL;
2240                 entry->cfe_flags = 0;
2241                 entry->cfe_necp_control_unit = 0;
2242                 entry->cfe_snd.cfe_pass_offset = 0;
2243                 entry->cfe_snd.cfe_peek_offset = 0;
2244                 entry->cfe_snd.cfe_peeked = 0;
2245                 entry->cfe_rcv.cfe_pass_offset = 0;
2246                 entry->cfe_rcv.cfe_peek_offset = 0;
2247                 entry->cfe_rcv.cfe_peeked = 0;
2248                 /*
2249                  * Timestamp the last action to avoid pre-maturely
2250                  * triggering garbage collection
2251                  */
2252                 microuptime(&entry->cfe_last_action);
2253
2254                 cfil_queue_init(&entry->cfe_snd.cfe_pending_q);
2255                 cfil_queue_init(&entry->cfe_rcv.cfe_pending_q);
2256                 cfil_queue_init(&entry->cfe_snd.cfe_ctl_q);
2257                 cfil_queue_init(&entry->cfe_rcv.cfe_ctl_q);
2258         }
2259
2260         cfil_rw_lock_exclusive(&cfil_lck_rw);
2261
2262         /*
2263          * Create a cfi_sock_id that's not the socket pointer!
2264          */
2265
2266         if (hash_entry == NULL) {
2267                 // This is the TCP case, cfil_info is tracked per socket
2268                 if (inp->inp_flowhash == 0) {
2269                         inp->inp_flowhash = inp_calc_flowhash(inp);
2270                 }
2271
2272                 so->so_cfil = cfil_info;
2273                 cfil_info->cfi_so = so;
2274                 cfil_info->cfi_sock_id =
2275                     ((so->so_gencnt << 32) | inp->inp_flowhash);
2276         } else {
2277                 // This is the UDP case, cfil_info is tracked in per-socket hash
2278                 cfil_info->cfi_so = so;
2279                 hash_entry->cfentry_cfil = cfil_info;
2280                 cfil_info->cfi_hash_entry = hash_entry;
2281                 cfil_info->cfi_sock_id = ((so->so_gencnt << 32) | (hash_entry->cfentry_flowhash & 0xffffffff));
2282                 CFIL_LOG(LOG_DEBUG, "CFIL: UDP inp_flowhash %x so_gencnt %llx entry flowhash %x sockID %llx",
2283                     inp->inp_flowhash, so->so_gencnt, hash_entry->cfentry_flowhash, cfil_info->cfi_sock_id);
2284
2285                 // Wake up gc thread if this is first flow added
2286                 if (cfil_sock_udp_attached_count == 0) {
2287                         thread_wakeup((caddr_t)&cfil_sock_udp_attached_count);
2288                 }
2289
2290                 cfil_sock_udp_attached_count++;
2291         }
2292
2293         TAILQ_INSERT_TAIL(&cfil_sock_head, cfil_info, cfi_link);
2294
2295         cfil_sock_attached_count++;
2296
2297         cfil_rw_unlock_exclusive(&cfil_lck_rw);
2298
2299 done:
2300         if (cfil_info != NULL) {
2301                 OSIncrementAtomic(&cfil_stats.cfs_cfi_alloc_ok);
2302         } else {
2303                 OSIncrementAtomic(&cfil_stats.cfs_cfi_alloc_fail);
2304         }
2305
2306         return cfil_info;
2307 }
2308
2309 int
2310 cfil_info_attach_unit(struct socket *so, uint32_t filter_control_unit, struct cfil_info *cfil_info)
2311 {
2312         int kcunit;
2313         int attached = 0;
2314
2315         CFIL_LOG(LOG_INFO, "");
2316
2317         socket_lock_assert_owned(so);
2318
2319         cfil_rw_lock_exclusive(&cfil_lck_rw);
2320
2321         for (kcunit = 1;
2322             content_filters != NULL && kcunit <= MAX_CONTENT_FILTER;
2323             kcunit++) {
2324                 struct content_filter *cfc = content_filters[kcunit - 1];
2325                 struct cfil_entry *entry;
2326
2327                 if (cfc == NULL) {
2328                         continue;
2329                 }
2330                 if (cfc->cf_necp_control_unit != filter_control_unit) {
2331                         continue;
2332                 }
2333
2334                 entry = &cfil_info->cfi_entries[kcunit - 1];
2335
2336                 entry->cfe_filter = cfc;
2337                 entry->cfe_necp_control_unit = filter_control_unit;
2338                 TAILQ_INSERT_TAIL(&cfc->cf_sock_entries, entry, cfe_link);
2339                 cfc->cf_sock_count++;
2340                 verify_content_filter(cfc);
2341                 attached = 1;
2342                 entry->cfe_flags |= CFEF_CFIL_ATTACHED;
2343                 break;
2344         }
2345
2346         cfil_rw_unlock_exclusive(&cfil_lck_rw);
2347
2348         return attached;
2349 }
2350
2351 static void
2352 cfil_info_free(struct cfil_info *cfil_info)
2353 {
2354         int kcunit;
2355         uint64_t in_drain = 0;
2356         uint64_t out_drained = 0;
2357
2358         if (cfil_info == NULL) {
2359                 return;
2360         }
2361
2362         CFIL_LOG(LOG_INFO, "");
2363
2364         cfil_rw_lock_exclusive(&cfil_lck_rw);
2365
2366         for (kcunit = 1;
2367             content_filters != NULL && kcunit <= MAX_CONTENT_FILTER;
2368             kcunit++) {
2369                 struct cfil_entry *entry;
2370                 struct content_filter *cfc;
2371
2372                 entry = &cfil_info->cfi_entries[kcunit - 1];
2373
2374                 /* Don't be silly and try to detach twice */
2375                 if (entry->cfe_filter == NULL) {
2376                         continue;
2377                 }
2378
2379                 cfc = content_filters[kcunit - 1];
2380
2381                 VERIFY(cfc == entry->cfe_filter);
2382
2383                 entry->cfe_filter = NULL;
2384                 entry->cfe_necp_control_unit = 0;
2385                 TAILQ_REMOVE(&cfc->cf_sock_entries, entry, cfe_link);
2386                 cfc->cf_sock_count--;
2387
2388                 verify_content_filter(cfc);
2389         }
2390         if (cfil_info->cfi_hash_entry != NULL) {
2391                 cfil_sock_udp_attached_count--;
2392         }
2393         cfil_sock_attached_count--;
2394         TAILQ_REMOVE(&cfil_sock_head, cfil_info, cfi_link);
2395
2396         out_drained += cfil_queue_drain(&cfil_info->cfi_snd.cfi_inject_q);
2397         in_drain += cfil_queue_drain(&cfil_info->cfi_rcv.cfi_inject_q);
2398
2399         for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
2400                 struct cfil_entry *entry;
2401
2402                 entry = &cfil_info->cfi_entries[kcunit - 1];
2403                 out_drained += cfil_queue_drain(&entry->cfe_snd.cfe_pending_q);
2404                 in_drain += cfil_queue_drain(&entry->cfe_rcv.cfe_pending_q);
2405                 out_drained += cfil_queue_drain(&entry->cfe_snd.cfe_ctl_q);
2406                 in_drain += cfil_queue_drain(&entry->cfe_rcv.cfe_ctl_q);
2407         }
2408         cfil_rw_unlock_exclusive(&cfil_lck_rw);
2409
2410         if (out_drained) {
2411                 OSIncrementAtomic(&cfil_stats.cfs_flush_out_free);
2412         }
2413         if (in_drain) {
2414                 OSIncrementAtomic(&cfil_stats.cfs_flush_in_free);
2415         }
2416
2417         zfree(cfil_info_zone, cfil_info);
2418 }
2419
2420 /*
2421  * Entry point from Sockets layer
2422  * The socket is locked.
2423  */
2424 errno_t
2425 cfil_sock_attach(struct socket *so)
2426 {
2427         errno_t error = 0;
2428         uint32_t filter_control_unit;
2429
2430         socket_lock_assert_owned(so);
2431
2432         /* Limit ourselves to TCP that are not MPTCP subflows */
2433         if ((so->so_proto->pr_domain->dom_family != PF_INET &&
2434             so->so_proto->pr_domain->dom_family != PF_INET6) ||
2435             so->so_proto->pr_type != SOCK_STREAM ||
2436             so->so_proto->pr_protocol != IPPROTO_TCP ||
2437             (so->so_flags & SOF_MP_SUBFLOW) != 0 ||
2438             (so->so_flags1 & SOF1_CONTENT_FILTER_SKIP) != 0) {
2439                 goto done;
2440         }
2441
2442         filter_control_unit = necp_socket_get_content_filter_control_unit(so);
2443         if (filter_control_unit == 0) {
2444                 goto done;
2445         }
2446
2447         if ((filter_control_unit & NECP_MASK_USERSPACE_ONLY) != 0) {
2448                 OSIncrementAtomic(&cfil_stats.cfs_sock_userspace_only);
2449                 goto done;
2450         }
2451         if (cfil_active_count == 0) {
2452                 OSIncrementAtomic(&cfil_stats.cfs_sock_attach_in_vain);
2453                 goto done;
2454         }
2455         if (so->so_cfil != NULL) {
2456                 OSIncrementAtomic(&cfil_stats.cfs_sock_attach_already);
2457                 CFIL_LOG(LOG_ERR, "already attached");
2458         } else {
2459                 cfil_info_alloc(so, NULL);
2460                 if (so->so_cfil == NULL) {
2461                         error = ENOMEM;
2462                         OSIncrementAtomic(&cfil_stats.cfs_sock_attach_no_mem);
2463                         goto done;
2464                 }
2465         }
2466         if (cfil_info_attach_unit(so, filter_control_unit, so->so_cfil) == 0) {
2467                 CFIL_LOG(LOG_ERR, "cfil_info_attach_unit(%u) failed",
2468                     filter_control_unit);
2469                 OSIncrementAtomic(&cfil_stats.cfs_sock_attach_failed);
2470                 goto done;
2471         }
2472         CFIL_LOG(LOG_INFO, "so %llx filter_control_unit %u sockID %llx",
2473             (uint64_t)VM_KERNEL_ADDRPERM(so),
2474             filter_control_unit, so->so_cfil->cfi_sock_id);
2475
2476         so->so_flags |= SOF_CONTENT_FILTER;
2477         OSIncrementAtomic(&cfil_stats.cfs_sock_attached);
2478
2479         /* Hold a reference on the socket */
2480         so->so_usecount++;
2481
2482         error = cfil_dispatch_attach_event(so, so->so_cfil, filter_control_unit);
2483         /* We can recover from flow control or out of memory errors */
2484         if (error == ENOBUFS || error == ENOMEM) {
2485                 error = 0;
2486         } else if (error != 0) {
2487                 goto done;
2488         }
2489
2490         CFIL_INFO_VERIFY(so->so_cfil);
2491 done:
2492         return error;
2493 }
2494
2495 /*
2496  * Entry point from Sockets layer
2497  * The socket is locked.
2498  */
2499 errno_t
2500 cfil_sock_detach(struct socket *so)
2501 {
2502         if (IS_UDP(so)) {
2503                 cfil_db_free(so);
2504                 return 0;
2505         }
2506
2507         if (so->so_cfil) {
2508                 if (so->so_flags & SOF_CONTENT_FILTER) {
2509                         so->so_flags &= ~SOF_CONTENT_FILTER;
2510                         VERIFY(so->so_usecount > 0);
2511                         so->so_usecount--;
2512                 }
2513                 cfil_info_free(so->so_cfil);
2514                 so->so_cfil = NULL;
2515                 OSIncrementAtomic(&cfil_stats.cfs_sock_detached);
2516         }
2517         return 0;
2518 }
2519
2520 static int
2521 cfil_dispatch_attach_event(struct socket *so, struct cfil_info *cfil_info, uint32_t filter_control_unit)
2522 {
2523         errno_t error = 0;
2524         struct cfil_entry *entry = NULL;
2525         struct cfil_msg_sock_attached msg_attached;
2526         uint32_t kcunit;
2527         struct content_filter *cfc = NULL;
2528
2529         socket_lock_assert_owned(so);
2530
2531         cfil_rw_lock_shared(&cfil_lck_rw);
2532
2533         if (so->so_proto == NULL || so->so_proto->pr_domain == NULL) {
2534                 error = EINVAL;
2535                 goto done;
2536         }
2537         /*
2538          * Find the matching filter unit
2539          */
2540         for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
2541                 cfc = content_filters[kcunit - 1];
2542
2543                 if (cfc == NULL) {
2544                         continue;
2545                 }
2546                 if (cfc->cf_necp_control_unit != filter_control_unit) {
2547                         continue;
2548                 }
2549                 entry = &cfil_info->cfi_entries[kcunit - 1];
2550                 if (entry->cfe_filter == NULL) {
2551                         continue;
2552                 }
2553
2554                 VERIFY(cfc == entry->cfe_filter);
2555
2556                 break;
2557         }
2558
2559         if (entry == NULL || entry->cfe_filter == NULL) {
2560                 goto done;
2561         }
2562
2563         if ((entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED)) {
2564                 goto done;
2565         }
2566
2567         CFIL_LOG(LOG_INFO, "so %llx filter_control_unit %u kcunit %u",
2568             (uint64_t)VM_KERNEL_ADDRPERM(so), filter_control_unit, kcunit);
2569
2570         /* Would be wasteful to try when flow controlled */
2571         if (cfc->cf_flags & CFF_FLOW_CONTROLLED) {
2572                 error = ENOBUFS;
2573                 goto done;
2574         }
2575
2576         bzero(&msg_attached, sizeof(struct cfil_msg_sock_attached));
2577         msg_attached.cfs_msghdr.cfm_len = sizeof(struct cfil_msg_sock_attached);
2578         msg_attached.cfs_msghdr.cfm_version = CFM_VERSION_CURRENT;
2579         msg_attached.cfs_msghdr.cfm_type = CFM_TYPE_EVENT;
2580         msg_attached.cfs_msghdr.cfm_op = CFM_OP_SOCKET_ATTACHED;
2581         msg_attached.cfs_msghdr.cfm_sock_id = entry->cfe_cfil_info->cfi_sock_id;
2582
2583         msg_attached.cfs_sock_family = so->so_proto->pr_domain->dom_family;
2584         msg_attached.cfs_sock_type = so->so_proto->pr_type;
2585         msg_attached.cfs_sock_protocol = so->so_proto->pr_protocol;
2586         msg_attached.cfs_pid = so->last_pid;
2587         memcpy(msg_attached.cfs_uuid, so->last_uuid, sizeof(uuid_t));
2588         if (so->so_flags & SOF_DELEGATED) {
2589                 msg_attached.cfs_e_pid = so->e_pid;
2590                 memcpy(msg_attached.cfs_e_uuid, so->e_uuid, sizeof(uuid_t));
2591         } else {
2592                 msg_attached.cfs_e_pid = so->last_pid;
2593                 memcpy(msg_attached.cfs_e_uuid, so->last_uuid, sizeof(uuid_t));
2594         }
2595
2596 #if LIFECYCLE_DEBUG
2597         CFIL_LOG(LOG_DEBUG, "CFIL: LIFECYCLE: SENDING ATTACH UP <sockID %llu> ",
2598             entry->cfe_cfil_info->cfi_sock_id);
2599 #endif
2600
2601         error = ctl_enqueuedata(entry->cfe_filter->cf_kcref,
2602             entry->cfe_filter->cf_kcunit,
2603             &msg_attached,
2604             sizeof(struct cfil_msg_sock_attached),
2605             CTL_DATA_EOR);
2606         if (error != 0) {
2607                 CFIL_LOG(LOG_ERR, "ctl_enqueuedata() failed: %d", error);
2608                 goto done;
2609         }
2610         microuptime(&entry->cfe_last_event);
2611         cfil_info->cfi_first_event.tv_sec = entry->cfe_last_event.tv_sec;
2612         cfil_info->cfi_first_event.tv_usec = entry->cfe_last_event.tv_usec;
2613
2614         entry->cfe_flags |= CFEF_SENT_SOCK_ATTACHED;
2615         OSIncrementAtomic(&cfil_stats.cfs_attach_event_ok);
2616 done:
2617
2618         /* We can recover from flow control */
2619         if (error == ENOBUFS) {
2620                 entry->cfe_flags |= CFEF_FLOW_CONTROLLED;
2621                 OSIncrementAtomic(&cfil_stats.cfs_attach_event_flow_control);
2622
2623                 if (!cfil_rw_lock_shared_to_exclusive(&cfil_lck_rw)) {
2624                         cfil_rw_lock_exclusive(&cfil_lck_rw);
2625                 }
2626
2627                 cfc->cf_flags |= CFF_FLOW_CONTROLLED;
2628
2629                 cfil_rw_unlock_exclusive(&cfil_lck_rw);
2630         } else {
2631                 if (error != 0) {
2632                         OSIncrementAtomic(&cfil_stats.cfs_attach_event_fail);
2633                 }
2634
2635                 cfil_rw_unlock_shared(&cfil_lck_rw);
2636         }
2637         return error;
2638 }
2639
2640 static int
2641 cfil_dispatch_disconnect_event(struct socket *so, struct cfil_info *cfil_info, uint32_t kcunit, int outgoing)
2642 {
2643         errno_t error = 0;
2644         struct mbuf *msg = NULL;
2645         struct cfil_entry *entry;
2646         struct cfe_buf *entrybuf;
2647         struct cfil_msg_hdr msg_disconnected;
2648         struct content_filter *cfc;
2649
2650         socket_lock_assert_owned(so);
2651
2652         cfil_rw_lock_shared(&cfil_lck_rw);
2653
2654         entry = &cfil_info->cfi_entries[kcunit - 1];
2655         if (outgoing) {
2656                 entrybuf = &entry->cfe_snd;
2657         } else {
2658                 entrybuf = &entry->cfe_rcv;
2659         }
2660
2661         cfc = entry->cfe_filter;
2662         if (cfc == NULL) {
2663                 goto done;
2664         }
2665
2666         CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d",
2667             (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing);
2668
2669         /*
2670          * Send the disconnection event once
2671          */
2672         if ((outgoing && (entry->cfe_flags & CFEF_SENT_DISCONNECT_OUT)) ||
2673             (!outgoing && (entry->cfe_flags & CFEF_SENT_DISCONNECT_IN))) {
2674                 CFIL_LOG(LOG_INFO, "so %llx disconnect already sent",
2675                     (uint64_t)VM_KERNEL_ADDRPERM(so));
2676                 goto done;
2677         }
2678
2679         /*
2680          * We're not disconnected as long as some data is waiting
2681          * to be delivered to the filter
2682          */
2683         if (outgoing && cfil_queue_empty(&entrybuf->cfe_ctl_q) == 0) {
2684                 CFIL_LOG(LOG_INFO, "so %llx control queue not empty",
2685                     (uint64_t)VM_KERNEL_ADDRPERM(so));
2686                 error = EBUSY;
2687                 goto done;
2688         }
2689         /* Would be wasteful to try when flow controlled */
2690         if (cfc->cf_flags & CFF_FLOW_CONTROLLED) {
2691                 error = ENOBUFS;
2692                 goto done;
2693         }
2694
2695 #if LIFECYCLE_DEBUG
2696         cfil_info_log(LOG_ERR, cfil_info, outgoing ?
2697             "CFIL: LIFECYCLE: OUT - SENDING DISCONNECT UP":
2698             "CFIL: LIFECYCLE: IN - SENDING DISCONNECT UP");
2699 #endif
2700
2701         bzero(&msg_disconnected, sizeof(struct cfil_msg_hdr));
2702         msg_disconnected.cfm_len = sizeof(struct cfil_msg_hdr);
2703         msg_disconnected.cfm_version = CFM_VERSION_CURRENT;
2704         msg_disconnected.cfm_type = CFM_TYPE_EVENT;
2705         msg_disconnected.cfm_op = outgoing ? CFM_OP_DISCONNECT_OUT :
2706             CFM_OP_DISCONNECT_IN;
2707         msg_disconnected.cfm_sock_id = entry->cfe_cfil_info->cfi_sock_id;
2708         error = ctl_enqueuedata(entry->cfe_filter->cf_kcref,
2709             entry->cfe_filter->cf_kcunit,
2710             &msg_disconnected,
2711             sizeof(struct cfil_msg_hdr),
2712             CTL_DATA_EOR);
2713         if (error != 0) {
2714                 CFIL_LOG(LOG_ERR, "ctl_enqueuembuf() failed: %d", error);
2715                 mbuf_freem(msg);
2716                 goto done;
2717         }
2718         microuptime(&entry->cfe_last_event);
2719         CFI_ADD_TIME_LOG(cfil_info, &entry->cfe_last_event, &cfil_info->cfi_first_event, msg_disconnected.cfm_op);
2720
2721         /* Remember we have sent the disconnection message */
2722         if (outgoing) {
2723                 entry->cfe_flags |= CFEF_SENT_DISCONNECT_OUT;
2724                 OSIncrementAtomic(&cfil_stats.cfs_disconnect_out_event_ok);
2725         } else {
2726                 entry->cfe_flags |= CFEF_SENT_DISCONNECT_IN;
2727                 OSIncrementAtomic(&cfil_stats.cfs_disconnect_in_event_ok);
2728         }
2729 done:
2730         if (error == ENOBUFS) {
2731                 entry->cfe_flags |= CFEF_FLOW_CONTROLLED;
2732                 OSIncrementAtomic(
2733                         &cfil_stats.cfs_disconnect_event_flow_control);
2734
2735                 if (!cfil_rw_lock_shared_to_exclusive(&cfil_lck_rw)) {
2736                         cfil_rw_lock_exclusive(&cfil_lck_rw);
2737                 }
2738
2739                 cfc->cf_flags |= CFF_FLOW_CONTROLLED;
2740
2741                 cfil_rw_unlock_exclusive(&cfil_lck_rw);
2742         } else {
2743                 if (error != 0) {
2744                         OSIncrementAtomic(
2745                                 &cfil_stats.cfs_disconnect_event_fail);
2746                 }
2747
2748                 cfil_rw_unlock_shared(&cfil_lck_rw);
2749         }
2750         return error;
2751 }
2752
2753 int
2754 cfil_dispatch_closed_event(struct socket *so, struct cfil_info *cfil_info, int kcunit)
2755 {
2756         struct cfil_entry *entry;
2757         struct cfil_msg_sock_closed msg_closed;
2758         errno_t error = 0;
2759         struct content_filter *cfc;
2760
2761         socket_lock_assert_owned(so);
2762
2763         cfil_rw_lock_shared(&cfil_lck_rw);
2764
2765         entry = &cfil_info->cfi_entries[kcunit - 1];
2766         cfc = entry->cfe_filter;
2767         if (cfc == NULL) {
2768                 goto done;
2769         }
2770
2771         CFIL_LOG(LOG_INFO, "so %llx kcunit %d",
2772             (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit);
2773
2774         /* Would be wasteful to try when flow controlled */
2775         if (cfc->cf_flags & CFF_FLOW_CONTROLLED) {
2776                 error = ENOBUFS;
2777                 goto done;
2778         }
2779         /*
2780          * Send a single closed message per filter
2781          */
2782         if ((entry->cfe_flags & CFEF_SENT_SOCK_CLOSED) != 0) {
2783                 goto done;
2784         }
2785         if ((entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED) == 0) {
2786                 goto done;
2787         }
2788
2789         microuptime(&entry->cfe_last_event);
2790         CFI_ADD_TIME_LOG(cfil_info, &entry->cfe_last_event, &cfil_info->cfi_first_event, CFM_OP_SOCKET_CLOSED);
2791
2792         bzero(&msg_closed, sizeof(struct cfil_msg_sock_closed));
2793         msg_closed.cfc_msghdr.cfm_len = sizeof(struct cfil_msg_sock_closed);
2794         msg_closed.cfc_msghdr.cfm_version = CFM_VERSION_CURRENT;
2795         msg_closed.cfc_msghdr.cfm_type = CFM_TYPE_EVENT;
2796         msg_closed.cfc_msghdr.cfm_op = CFM_OP_SOCKET_CLOSED;
2797         msg_closed.cfc_msghdr.cfm_sock_id = entry->cfe_cfil_info->cfi_sock_id;
2798         msg_closed.cfc_first_event.tv_sec = cfil_info->cfi_first_event.tv_sec;
2799         msg_closed.cfc_first_event.tv_usec = cfil_info->cfi_first_event.tv_usec;
2800         memcpy(msg_closed.cfc_op_time, cfil_info->cfi_op_time, sizeof(uint32_t) * CFI_MAX_TIME_LOG_ENTRY);
2801         memcpy(msg_closed.cfc_op_list, cfil_info->cfi_op_list, sizeof(unsigned char) * CFI_MAX_TIME_LOG_ENTRY);
2802         msg_closed.cfc_op_list_ctr = cfil_info->cfi_op_list_ctr;
2803
2804 #if LIFECYCLE_DEBUG
2805         CFIL_LOG(LOG_ERR, "CFIL: LIFECYCLE: SENDING CLOSED UP: <sock id %llu> op ctr %d, start time %llu.%llu", msg_closed.cfc_msghdr.cfm_sock_id, cfil_info->cfi_op_list_ctr, cfil_info->cfi_first_event.tv_sec, cfil_info->cfi_first_event.tv_usec);
2806 #endif
2807         /* for debugging
2808          *  if (msg_closed.cfc_op_list_ctr > CFI_MAX_TIME_LOG_ENTRY) {
2809          *       msg_closed.cfc_op_list_ctr  = CFI_MAX_TIME_LOG_ENTRY;       // just in case
2810          *  }
2811          *  for (unsigned int i = 0; i < msg_closed.cfc_op_list_ctr ; i++) {
2812          *       CFIL_LOG(LOG_ERR, "MD: socket %llu event %2u, time + %u msec", msg_closed.cfc_msghdr.cfm_sock_id, (unsigned short)msg_closed.cfc_op_list[i], msg_closed.cfc_op_time[i]);
2813          *  }
2814          */
2815
2816         error = ctl_enqueuedata(entry->cfe_filter->cf_kcref,
2817             entry->cfe_filter->cf_kcunit,
2818             &msg_closed,
2819             sizeof(struct cfil_msg_sock_closed),
2820             CTL_DATA_EOR);
2821         if (error != 0) {
2822                 CFIL_LOG(LOG_ERR, "ctl_enqueuedata() failed: %d",
2823                     error);
2824                 goto done;
2825         }
2826
2827         entry->cfe_flags |= CFEF_SENT_SOCK_CLOSED;
2828         OSIncrementAtomic(&cfil_stats.cfs_closed_event_ok);
2829 done:
2830         /* We can recover from flow control */
2831         if (error == ENOBUFS) {
2832                 entry->cfe_flags |= CFEF_FLOW_CONTROLLED;
2833                 OSIncrementAtomic(&cfil_stats.cfs_closed_event_flow_control);
2834
2835                 if (!cfil_rw_lock_shared_to_exclusive(&cfil_lck_rw)) {
2836                         cfil_rw_lock_exclusive(&cfil_lck_rw);
2837                 }
2838
2839                 cfc->cf_flags |= CFF_FLOW_CONTROLLED;
2840
2841                 cfil_rw_unlock_exclusive(&cfil_lck_rw);
2842         } else {
2843                 if (error != 0) {
2844                         OSIncrementAtomic(&cfil_stats.cfs_closed_event_fail);
2845                 }
2846
2847                 cfil_rw_unlock_shared(&cfil_lck_rw);
2848         }
2849
2850         return error;
2851 }
2852
2853 static void
2854 fill_ip6_sockaddr_4_6(union sockaddr_in_4_6 *sin46,
2855     struct in6_addr *ip6, u_int16_t port)
2856 {
2857         struct sockaddr_in6 *sin6 = &sin46->sin6;
2858
2859         sin6->sin6_family = AF_INET6;
2860         sin6->sin6_len = sizeof(*sin6);
2861         sin6->sin6_port = port;
2862         sin6->sin6_addr = *ip6;
2863         if (IN6_IS_SCOPE_EMBED(&sin6->sin6_addr)) {
2864                 sin6->sin6_scope_id = ntohs(sin6->sin6_addr.s6_addr16[1]);
2865                 sin6->sin6_addr.s6_addr16[1] = 0;
2866         }
2867 }
2868
2869 static void
2870 fill_ip_sockaddr_4_6(union sockaddr_in_4_6 *sin46,
2871     struct in_addr ip, u_int16_t port)
2872 {
2873         struct sockaddr_in *sin = &sin46->sin;
2874
2875         sin->sin_family = AF_INET;
2876         sin->sin_len = sizeof(*sin);
2877         sin->sin_port = port;
2878         sin->sin_addr.s_addr = ip.s_addr;
2879 }
2880
2881 static void
2882 cfil_get_flow_address_v6(struct cfil_hash_entry *entry, struct inpcb *inp,
2883     struct in6_addr **laddr, struct in6_addr **faddr,
2884     u_int16_t *lport, u_int16_t *fport)
2885 {
2886         if (entry != NULL) {
2887                 *laddr = &entry->cfentry_laddr.addr6;
2888                 *faddr = &entry->cfentry_faddr.addr6;
2889                 *lport = entry->cfentry_lport;
2890                 *fport = entry->cfentry_fport;
2891         } else {
2892                 *laddr = &inp->in6p_laddr;
2893                 *faddr = &inp->in6p_faddr;
2894                 *lport = inp->inp_lport;
2895                 *fport = inp->inp_fport;
2896         }
2897 }
2898
2899 static void
2900 cfil_get_flow_address(struct cfil_hash_entry *entry, struct inpcb *inp,
2901     struct in_addr *laddr, struct in_addr *faddr,
2902     u_int16_t *lport, u_int16_t *fport)
2903 {
2904         if (entry != NULL) {
2905                 *laddr = entry->cfentry_laddr.addr46.ia46_addr4;
2906                 *faddr = entry->cfentry_faddr.addr46.ia46_addr4;
2907                 *lport = entry->cfentry_lport;
2908                 *fport = entry->cfentry_fport;
2909         } else {
2910                 *laddr = inp->inp_laddr;
2911                 *faddr = inp->inp_faddr;
2912                 *lport = inp->inp_lport;
2913                 *fport = inp->inp_fport;
2914         }
2915 }
2916
2917 static int
2918 cfil_dispatch_data_event(struct socket *so, struct cfil_info *cfil_info, uint32_t kcunit, int outgoing,
2919     struct mbuf *data, unsigned int copyoffset, unsigned int copylen)
2920 {
2921         errno_t error = 0;
2922         struct mbuf *copy = NULL;
2923         struct mbuf *msg = NULL;
2924         unsigned int one = 1;
2925         struct cfil_msg_data_event *data_req;
2926         size_t hdrsize;
2927         struct inpcb *inp = (struct inpcb *)so->so_pcb;
2928         struct cfil_entry *entry;
2929         struct cfe_buf *entrybuf;
2930         struct content_filter *cfc;
2931         struct timeval tv;
2932
2933         cfil_rw_lock_shared(&cfil_lck_rw);
2934
2935         entry = &cfil_info->cfi_entries[kcunit - 1];
2936         if (outgoing) {
2937                 entrybuf = &entry->cfe_snd;
2938         } else {
2939                 entrybuf = &entry->cfe_rcv;
2940         }
2941
2942         cfc = entry->cfe_filter;
2943         if (cfc == NULL) {
2944                 goto done;
2945         }
2946
2947         data = cfil_data_start(data);
2948         if (data == NULL || (data->m_flags & M_PKTHDR) == 0) {
2949                 CFIL_LOG(LOG_ERR, "NOT PKTHDR");
2950                 goto done;
2951         }
2952
2953         CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d",
2954             (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing);
2955
2956         socket_lock_assert_owned(so);
2957
2958         /* Would be wasteful to try */
2959         if (cfc->cf_flags & CFF_FLOW_CONTROLLED) {
2960                 error = ENOBUFS;
2961                 goto done;
2962         }
2963
2964         /* Make a copy of the data to pass to kernel control socket */
2965         copy = m_copym_mode(data, copyoffset, copylen, M_DONTWAIT,
2966             M_COPYM_NOOP_HDR);
2967         if (copy == NULL) {
2968                 CFIL_LOG(LOG_ERR, "m_copym_mode() failed");
2969                 error = ENOMEM;
2970                 goto done;
2971         }
2972
2973         /* We need an mbuf packet for the message header */
2974         hdrsize = sizeof(struct cfil_msg_data_event);
2975         error = mbuf_allocpacket(MBUF_DONTWAIT, hdrsize, &one, &msg);
2976         if (error != 0) {
2977                 CFIL_LOG(LOG_ERR, "mbuf_allocpacket() failed");
2978                 m_freem(copy);
2979                 /*
2980                  * ENOBUFS is to indicate flow control
2981                  */
2982                 error = ENOMEM;
2983                 goto done;
2984         }
2985         mbuf_setlen(msg, hdrsize);
2986         mbuf_pkthdr_setlen(msg, hdrsize + copylen);
2987         msg->m_next = copy;
2988         data_req = (struct cfil_msg_data_event *)mbuf_data(msg);
2989         bzero(data_req, hdrsize);
2990         data_req->cfd_msghdr.cfm_len = hdrsize + copylen;
2991         data_req->cfd_msghdr.cfm_version = 1;
2992         data_req->cfd_msghdr.cfm_type = CFM_TYPE_EVENT;
2993         data_req->cfd_msghdr.cfm_op =
2994             outgoing ? CFM_OP_DATA_OUT : CFM_OP_DATA_IN;
2995         data_req->cfd_msghdr.cfm_sock_id =
2996             entry->cfe_cfil_info->cfi_sock_id;
2997         data_req->cfd_start_offset = entrybuf->cfe_peeked;
2998         data_req->cfd_end_offset = entrybuf->cfe_peeked + copylen;
2999
3000         /*
3001          * TBD:
3002          * For non connected sockets need to copy addresses from passed
3003          * parameters
3004          */
3005         if (inp->inp_vflag & INP_IPV6) {
3006                 struct in6_addr *laddr = NULL, *faddr = NULL;
3007                 u_int16_t lport = 0, fport = 0;
3008
3009                 cfil_get_flow_address_v6(cfil_info->cfi_hash_entry, inp,
3010                     &laddr, &faddr, &lport, &fport);
3011                 if (outgoing) {
3012                         fill_ip6_sockaddr_4_6(&data_req->cfc_src, laddr, lport);
3013                         fill_ip6_sockaddr_4_6(&data_req->cfc_dst, faddr, fport);
3014                 } else {
3015                         fill_ip6_sockaddr_4_6(&data_req->cfc_src, faddr, fport);
3016                         fill_ip6_sockaddr_4_6(&data_req->cfc_dst, laddr, lport);
3017                 }
3018         } else if (inp->inp_vflag & INP_IPV4) {
3019                 struct in_addr laddr = {0}, faddr = {0};
3020                 u_int16_t lport = 0, fport = 0;
3021
3022                 cfil_get_flow_address(cfil_info->cfi_hash_entry, inp,
3023                     &laddr, &faddr, &lport, &fport);
3024
3025                 if (outgoing) {
3026                         fill_ip_sockaddr_4_6(&data_req->cfc_src, laddr, lport);
3027                         fill_ip_sockaddr_4_6(&data_req->cfc_dst, faddr, fport);
3028                 } else {
3029                         fill_ip_sockaddr_4_6(&data_req->cfc_src, faddr, fport);
3030                         fill_ip_sockaddr_4_6(&data_req->cfc_dst, laddr, lport);
3031                 }
3032         }
3033
3034         microuptime(&tv);
3035         CFI_ADD_TIME_LOG(cfil_info, &tv, &cfil_info->cfi_first_event, data_req->cfd_msghdr.cfm_op);
3036
3037         /* Pass the message to the content filter */
3038         error = ctl_enqueuembuf(entry->cfe_filter->cf_kcref,
3039             entry->cfe_filter->cf_kcunit,
3040             msg, CTL_DATA_EOR);
3041         if (error != 0) {
3042                 CFIL_LOG(LOG_ERR, "ctl_enqueuembuf() failed: %d", error);
3043                 mbuf_freem(msg);
3044                 goto done;
3045         }
3046         entry->cfe_flags &= ~CFEF_FLOW_CONTROLLED;
3047         OSIncrementAtomic(&cfil_stats.cfs_data_event_ok);
3048
3049 #if VERDICT_DEBUG
3050         CFIL_LOG(LOG_ERR, "CFIL: VERDICT ACTION: so %llx sockID %llu outgoing %d: mbuf %llx copyoffset %u copylen %u",
3051             (uint64_t)VM_KERNEL_ADDRPERM(so), cfil_info->cfi_sock_id, outgoing, (uint64_t)VM_KERNEL_ADDRPERM(data), copyoffset, copylen);
3052 #endif
3053
3054 done:
3055         if (error == ENOBUFS) {
3056                 entry->cfe_flags |= CFEF_FLOW_CONTROLLED;
3057                 OSIncrementAtomic(
3058                         &cfil_stats.cfs_data_event_flow_control);
3059
3060                 if (!cfil_rw_lock_shared_to_exclusive(&cfil_lck_rw)) {
3061                         cfil_rw_lock_exclusive(&cfil_lck_rw);
3062                 }
3063
3064                 cfc->cf_flags |= CFF_FLOW_CONTROLLED;
3065
3066                 cfil_rw_unlock_exclusive(&cfil_lck_rw);
3067         } else {
3068                 if (error != 0) {
3069                         OSIncrementAtomic(&cfil_stats.cfs_data_event_fail);
3070                 }
3071
3072                 cfil_rw_unlock_shared(&cfil_lck_rw);
3073         }
3074         return error;
3075 }
3076
3077 /*
3078  * Process the queue of data waiting to be delivered to content filter
3079  */
3080 static int
3081 cfil_data_service_ctl_q(struct socket *so, struct cfil_info *cfil_info, uint32_t kcunit, int outgoing)
3082 {
3083         errno_t error = 0;
3084         struct mbuf *data, *tmp = NULL;
3085         unsigned int datalen = 0, copylen = 0, copyoffset = 0;
3086         struct cfil_entry *entry;
3087         struct cfe_buf *entrybuf;
3088         uint64_t currentoffset = 0;
3089
3090         if (cfil_info == NULL) {
3091                 return 0;
3092         }
3093
3094         CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d",
3095             (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing);
3096
3097         socket_lock_assert_owned(so);
3098
3099         entry = &cfil_info->cfi_entries[kcunit - 1];
3100         if (outgoing) {
3101                 entrybuf = &entry->cfe_snd;
3102         } else {
3103                 entrybuf = &entry->cfe_rcv;
3104         }
3105
3106         /* Send attached message if not yet done */
3107         if ((entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED) == 0) {
3108                 error = cfil_dispatch_attach_event(so, cfil_info, kcunit);
3109                 if (error != 0) {
3110                         /* We can recover from flow control */
3111                         if (error == ENOBUFS || error == ENOMEM) {
3112                                 error = 0;
3113                         }
3114                         goto done;
3115                 }
3116         } else if ((entry->cfe_flags & CFEF_DATA_START) == 0) {
3117                 OSIncrementAtomic(&cfil_stats.cfs_ctl_q_not_started);
3118                 goto done;
3119         }
3120
3121 #if DATA_DEBUG
3122         CFIL_LOG(LOG_DEBUG, "CFIL: SERVICE CTL-Q: pass_offset %llu peeked %llu peek_offset %llu",
3123             entrybuf->cfe_pass_offset,
3124             entrybuf->cfe_peeked,
3125             entrybuf->cfe_peek_offset);
3126 #endif
3127
3128         /* Move all data that can pass */
3129         while ((data = cfil_queue_first(&entrybuf->cfe_ctl_q)) != NULL &&
3130             entrybuf->cfe_ctl_q.q_start < entrybuf->cfe_pass_offset) {
3131                 datalen = cfil_data_length(data, NULL, NULL);
3132                 tmp = data;
3133
3134                 if (entrybuf->cfe_ctl_q.q_start + datalen <=
3135                     entrybuf->cfe_pass_offset) {
3136                         /*
3137                          * The first mbuf can fully pass
3138                          */
3139                         copylen = datalen;
3140                 } else {
3141                         /*
3142                          * The first mbuf can partially pass
3143                          */
3144                         copylen = entrybuf->cfe_pass_offset -
3145                             entrybuf->cfe_ctl_q.q_start;
3146                 }
3147                 VERIFY(copylen <= datalen);
3148
3149 #if DATA_DEBUG
3150                 CFIL_LOG(LOG_DEBUG,
3151                     "CFIL: SERVICE CTL-Q PASSING: %llx first %llu peeked %llu pass %llu peek %llu"
3152                     "datalen %u copylen %u",
3153                     (uint64_t)VM_KERNEL_ADDRPERM(tmp),
3154                     entrybuf->cfe_ctl_q.q_start,
3155                     entrybuf->cfe_peeked,
3156                     entrybuf->cfe_pass_offset,
3157                     entrybuf->cfe_peek_offset,
3158                     datalen, copylen);
3159 #endif
3160
3161                 /*
3162                  * Data that passes has been peeked at explicitly or
3163                  * implicitly
3164                  */
3165                 if (entrybuf->cfe_ctl_q.q_start + copylen >
3166                     entrybuf->cfe_peeked) {
3167                         entrybuf->cfe_peeked =
3168                             entrybuf->cfe_ctl_q.q_start + copylen;
3169                 }
3170                 /*
3171                  * Stop on partial pass
3172                  */
3173                 if (copylen < datalen) {
3174                         break;
3175                 }
3176
3177                 /* All good, move full data from ctl queue to pending queue */
3178                 cfil_queue_remove(&entrybuf->cfe_ctl_q, data, datalen);
3179
3180                 cfil_queue_enqueue(&entrybuf->cfe_pending_q, data, datalen);
3181                 if (outgoing) {
3182                         OSAddAtomic64(datalen,
3183                             &cfil_stats.cfs_pending_q_out_enqueued);
3184                 } else {
3185                         OSAddAtomic64(datalen,
3186                             &cfil_stats.cfs_pending_q_in_enqueued);
3187                 }
3188         }
3189         CFIL_INFO_VERIFY(cfil_info);
3190         if (tmp != NULL) {
3191                 CFIL_LOG(LOG_DEBUG,
3192                     "%llx first %llu peeked %llu pass %llu peek %llu"
3193                     "datalen %u copylen %u",
3194                     (uint64_t)VM_KERNEL_ADDRPERM(tmp),
3195                     entrybuf->cfe_ctl_q.q_start,
3196                     entrybuf->cfe_peeked,
3197                     entrybuf->cfe_pass_offset,
3198                     entrybuf->cfe_peek_offset,
3199                     datalen, copylen);
3200         }
3201         tmp = NULL;
3202
3203         /* Now deal with remaining data the filter wants to peek at */
3204         for (data = cfil_queue_first(&entrybuf->cfe_ctl_q),
3205             currentoffset = entrybuf->cfe_ctl_q.q_start;
3206             data != NULL && currentoffset < entrybuf->cfe_peek_offset;
3207             data = cfil_queue_next(&entrybuf->cfe_ctl_q, data),
3208             currentoffset += datalen) {
3209                 datalen = cfil_data_length(data, NULL, NULL);
3210                 tmp = data;
3211
3212                 /* We've already peeked at this mbuf */
3213                 if (currentoffset + datalen <= entrybuf->cfe_peeked) {
3214                         continue;
3215                 }
3216                 /*
3217                  * The data in the first mbuf may have been
3218                  * partially peeked at
3219                  */
3220                 copyoffset = entrybuf->cfe_peeked - currentoffset;
3221                 VERIFY(copyoffset < datalen);
3222                 copylen = datalen - copyoffset;
3223                 VERIFY(copylen <= datalen);
3224                 /*
3225                  * Do not copy more than needed
3226                  */
3227                 if (currentoffset + copyoffset + copylen >
3228                     entrybuf->cfe_peek_offset) {
3229                         copylen = entrybuf->cfe_peek_offset -
3230                             (currentoffset + copyoffset);
3231                 }
3232
3233 #if DATA_DEBUG
3234                 CFIL_LOG(LOG_DEBUG,
3235                     "CFIL: SERVICE CTL-Q PEEKING: %llx current %llu peeked %llu pass %llu peek %llu "
3236                     "datalen %u copylen %u copyoffset %u",
3237                     (uint64_t)VM_KERNEL_ADDRPERM(tmp),
3238                     currentoffset,
3239                     entrybuf->cfe_peeked,
3240                     entrybuf->cfe_pass_offset,
3241                     entrybuf->cfe_peek_offset,
3242                     datalen, copylen, copyoffset);
3243 #endif
3244
3245                 /*
3246                  * Stop if there is nothing more to peek at
3247                  */
3248                 if (copylen == 0) {
3249                         break;
3250                 }
3251                 /*
3252                  * Let the filter get a peek at this span of data
3253                  */
3254                 error = cfil_dispatch_data_event(so, cfil_info, kcunit,
3255                     outgoing, data, copyoffset, copylen);
3256                 if (error != 0) {
3257                         /* On error, leave data in ctl_q */
3258                         break;
3259                 }
3260                 entrybuf->cfe_peeked += copylen;
3261                 if (outgoing) {
3262                         OSAddAtomic64(copylen,
3263                             &cfil_stats.cfs_ctl_q_out_peeked);
3264                 } else {
3265                         OSAddAtomic64(copylen,
3266                             &cfil_stats.cfs_ctl_q_in_peeked);
3267                 }
3268
3269                 /* Stop when data could not be fully peeked at */
3270                 if (copylen + copyoffset < datalen) {
3271                         break;
3272                 }
3273         }
3274         CFIL_INFO_VERIFY(cfil_info);
3275         if (tmp != NULL) {
3276                 CFIL_LOG(LOG_DEBUG,
3277                     "%llx first %llu peeked %llu pass %llu peek %llu"
3278                     "datalen %u copylen %u copyoffset %u",
3279                     (uint64_t)VM_KERNEL_ADDRPERM(tmp),
3280                     currentoffset,
3281                     entrybuf->cfe_peeked,
3282                     entrybuf->cfe_pass_offset,
3283                     entrybuf->cfe_peek_offset,
3284                     datalen, copylen, copyoffset);
3285         }
3286
3287         /*
3288          * Process data that has passed the filter
3289          */
3290         error = cfil_service_pending_queue(so, cfil_info, kcunit, outgoing);
3291         if (error != 0) {
3292                 CFIL_LOG(LOG_ERR, "cfil_service_pending_queue() error %d",
3293                     error);
3294                 goto done;
3295         }
3296
3297         /*
3298          * Dispatch disconnect events that could not be sent
3299          */
3300         if (cfil_info == NULL) {
3301                 goto done;
3302         } else if (outgoing) {
3303                 if ((cfil_info->cfi_flags & CFIF_SHUT_WR) &&
3304                     !(entry->cfe_flags & CFEF_SENT_DISCONNECT_OUT)) {
3305                         cfil_dispatch_disconnect_event(so, cfil_info, kcunit, 1);
3306                 }
3307         } else {
3308                 if ((cfil_info->cfi_flags & CFIF_SHUT_RD) &&
3309                     !(entry->cfe_flags & CFEF_SENT_DISCONNECT_IN)) {
3310                         cfil_dispatch_disconnect_event(so, cfil_info, kcunit, 0);
3311                 }
3312         }
3313
3314 done:
3315         CFIL_LOG(LOG_DEBUG,
3316             "first %llu peeked %llu pass %llu peek %llu",
3317             entrybuf->cfe_ctl_q.q_start,
3318             entrybuf->cfe_peeked,
3319             entrybuf->cfe_pass_offset,
3320             entrybuf->cfe_peek_offset);
3321
3322         CFIL_INFO_VERIFY(cfil_info);
3323         return error;
3324 }
3325
3326 /*
3327  * cfil_data_filter()
3328  *
3329  * Process data for a content filter installed on a socket
3330  */
3331 int
3332 cfil_data_filter(struct socket *so, struct cfil_info *cfil_info, uint32_t kcunit, int outgoing,
3333     struct mbuf *data, uint64_t datalen)
3334 {
3335         errno_t error = 0;
3336         struct cfil_entry *entry;
3337         struct cfe_buf *entrybuf;
3338
3339         CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d",
3340             (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing);
3341
3342         socket_lock_assert_owned(so);
3343
3344         entry = &cfil_info->cfi_entries[kcunit - 1];
3345         if (outgoing) {
3346                 entrybuf = &entry->cfe_snd;
3347         } else {
3348                 entrybuf = &entry->cfe_rcv;
3349         }
3350
3351         /* Are we attached to the filter? */
3352         if (entry->cfe_filter == NULL) {
3353                 error = 0;
3354                 goto done;
3355         }
3356
3357         /* Dispatch to filters */
3358         cfil_queue_enqueue(&entrybuf->cfe_ctl_q, data, datalen);
3359         if (outgoing) {
3360                 OSAddAtomic64(datalen,
3361                     &cfil_stats.cfs_ctl_q_out_enqueued);
3362         } else {
3363                 OSAddAtomic64(datalen,
3364                     &cfil_stats.cfs_ctl_q_in_enqueued);
3365         }
3366
3367         error = cfil_data_service_ctl_q(so, cfil_info, kcunit, outgoing);
3368         if (error != 0) {
3369                 CFIL_LOG(LOG_ERR, "cfil_data_service_ctl_q() error %d",
3370                     error);
3371         }
3372         /*
3373          * We have to return EJUSTRETURN in all cases to avoid double free
3374          * by socket layer
3375          */
3376         error = EJUSTRETURN;
3377 done:
3378         CFIL_INFO_VERIFY(cfil_info);
3379
3380         CFIL_LOG(LOG_INFO, "return %d", error);
3381         return error;
3382 }
3383
3384 /*
3385  * cfil_service_inject_queue() re-inject data that passed the
3386  * content filters
3387  */
3388 static int
3389 cfil_service_inject_queue(struct socket *so, struct cfil_info *cfil_info, int outgoing)
3390 {
3391         mbuf_t data;
3392         unsigned int datalen;
3393         int mbcnt = 0;
3394         int mbnum = 0;
3395         errno_t error = 0;
3396         struct cfi_buf *cfi_buf;
3397         struct cfil_queue *inject_q;
3398         int need_rwakeup = 0;
3399         int count = 0;
3400
3401         if (cfil_info == NULL) {
3402                 return 0;
3403         }
3404
3405         socket_lock_assert_owned(so);
3406
3407         if (outgoing) {
3408                 cfi_buf = &cfil_info->cfi_snd;
3409                 cfil_info->cfi_flags &= ~CFIF_RETRY_INJECT_OUT;
3410         } else {
3411                 cfi_buf = &cfil_info->cfi_rcv;
3412                 cfil_info->cfi_flags &= ~CFIF_RETRY_INJECT_IN;
3413         }
3414         inject_q = &cfi_buf->cfi_inject_q;
3415
3416         if (cfil_queue_empty(inject_q)) {
3417                 return 0;
3418         }
3419
3420 #if DATA_DEBUG | VERDICT_DEBUG
3421         CFIL_LOG(LOG_ERR, "CFIL: SERVICE INJECT-Q: <so %llx> outgoing %d queue len %llu",
3422             (uint64_t)VM_KERNEL_ADDRPERM(so), outgoing, cfil_queue_len(inject_q));
3423 #endif
3424
3425         while ((data = cfil_queue_first(inject_q)) != NULL) {
3426                 datalen = cfil_data_length(data, &mbcnt, &mbnum);
3427
3428 #if DATA_DEBUG
3429                 CFIL_LOG(LOG_DEBUG, "CFIL: SERVICE INJECT-Q: <%s>: <so %llx> data %llx datalen %u (mbcnt %u)",
3430                     remote_addr_ptr ? "UNCONNECTED" : "CONNECTED",
3431                     (uint64_t)VM_KERNEL_ADDRPERM(so), (uint64_t)VM_KERNEL_ADDRPERM(data), datalen, mbcnt);
3432 #endif
3433
3434                 /* Remove data from queue and adjust stats */
3435                 cfil_queue_remove(inject_q, data, datalen);
3436                 cfi_buf->cfi_pending_first += datalen;
3437                 cfi_buf->cfi_pending_mbcnt -= mbcnt;
3438                 cfi_buf->cfi_pending_mbnum -= mbnum;
3439                 cfil_info_buf_verify(cfi_buf);
3440
3441                 if (outgoing) {
3442                         error = sosend_reinject(so, NULL, data, NULL, 0);
3443                         if (error != 0) {
3444 #if DATA_DEBUG
3445                                 cfil_info_log(LOG_ERR, cfil_info, "CFIL: Error: sosend_reinject() failed");
3446                                 CFIL_LOG(LOG_ERR, "### sosend() failed %d", error);
3447 #endif
3448                                 break;
3449                         }
3450                         // At least one injection succeeded, need to wake up pending threads.
3451                         need_rwakeup = 1;
3452                 } else {
3453                         data->m_flags |= M_SKIPCFIL;
3454
3455                         /*
3456                          * NOTE: We currently only support TCP and UDP.
3457                          * For RAWIP, MPTCP and message TCP we'll
3458                          * need to call the appropriate sbappendxxx()
3459                          * of fix sock_inject_data_in()
3460                          */
3461                         if (IS_UDP(so) == TRUE) {
3462                                 if (sbappendchain(&so->so_rcv, data, 0)) {
3463                                         need_rwakeup = 1;
3464                                 }
3465                         } else {
3466                                 if (sbappendstream(&so->so_rcv, data)) {
3467                                         need_rwakeup = 1;
3468                                 }
3469                         }
3470                 }
3471
3472                 if (outgoing) {
3473                         OSAddAtomic64(datalen,
3474                             &cfil_stats.cfs_inject_q_out_passed);
3475                 } else {
3476                         OSAddAtomic64(datalen,
3477                             &cfil_stats.cfs_inject_q_in_passed);
3478                 }
3479
3480                 count++;
3481         }
3482
3483 #if DATA_DEBUG | VERDICT_DEBUG
3484         CFIL_LOG(LOG_ERR, "CFIL: SERVICE INJECT-Q: <so %llx> injected %d",
3485             (uint64_t)VM_KERNEL_ADDRPERM(so), count);
3486 #endif
3487
3488         /* A single wakeup is for several packets is more efficient */
3489         if (need_rwakeup) {
3490                 if (outgoing == TRUE) {
3491                         sowwakeup(so);
3492                 } else {
3493                         sorwakeup(so);
3494                 }
3495         }
3496
3497         if (error != 0 && cfil_info) {
3498                 if (error == ENOBUFS) {
3499                         OSIncrementAtomic(&cfil_stats.cfs_inject_q_nobufs);
3500                 }
3501                 if (error == ENOMEM) {
3502                         OSIncrementAtomic(&cfil_stats.cfs_inject_q_nomem);
3503                 }
3504
3505                 if (outgoing) {
3506                         cfil_info->cfi_flags |= CFIF_RETRY_INJECT_OUT;
3507                         OSIncrementAtomic(&cfil_stats.cfs_inject_q_out_fail);
3508                 } else {
3509                         cfil_info->cfi_flags |= CFIF_RETRY_INJECT_IN;
3510                         OSIncrementAtomic(&cfil_stats.cfs_inject_q_in_fail);
3511                 }
3512         }
3513
3514         /*
3515          * Notify
3516          */
3517         if (cfil_info && (cfil_info->cfi_flags & CFIF_SHUT_WR)) {
3518                 cfil_sock_notify_shutdown(so, SHUT_WR);
3519                 if (cfil_sock_data_pending(&so->so_snd) == 0) {
3520                         soshutdownlock_final(so, SHUT_WR);
3521                 }
3522         }
3523         if (cfil_info && (cfil_info->cfi_flags & CFIF_CLOSE_WAIT)) {
3524                 if (cfil_filters_attached(so) == 0) {
3525                         CFIL_LOG(LOG_INFO, "so %llx waking",
3526                             (uint64_t)VM_KERNEL_ADDRPERM(so));
3527                         wakeup((caddr_t)cfil_info);
3528                 }
3529         }
3530
3531         CFIL_INFO_VERIFY(cfil_info);
3532
3533         return error;
3534 }
3535
3536 static int
3537 cfil_service_pending_queue(struct socket *so, struct cfil_info *cfil_info, uint32_t kcunit, int outgoing)
3538 {
3539         uint64_t passlen, curlen;
3540         mbuf_t data;
3541         unsigned int datalen;
3542         errno_t error = 0;
3543         struct cfil_entry *entry;
3544         struct cfe_buf *entrybuf;
3545         struct cfil_queue *pending_q;
3546
3547         CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d",
3548             (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing);
3549
3550         socket_lock_assert_owned(so);
3551
3552         entry = &cfil_info->cfi_entries[kcunit - 1];
3553         if (outgoing) {
3554                 entrybuf = &entry->cfe_snd;
3555         } else {
3556                 entrybuf = &entry->cfe_rcv;
3557         }
3558
3559         pending_q = &entrybuf->cfe_pending_q;
3560
3561         passlen = entrybuf->cfe_pass_offset - pending_q->q_start;
3562
3563         /*
3564          * Locate the chunks of data that we can pass to the next filter
3565          * A data chunk must be on mbuf boundaries
3566          */
3567         curlen = 0;
3568         while ((data = cfil_queue_first(pending_q)) != NULL) {
3569                 datalen = cfil_data_length(data, NULL, NULL);
3570
3571 #if DATA_DEBUG
3572                 CFIL_LOG(LOG_DEBUG,
3573                     "CFIL: SERVICE PENDING-Q: data %llx datalen %u passlen %llu curlen %llu",
3574                     (uint64_t)VM_KERNEL_ADDRPERM(data), datalen,
3575                     passlen, curlen);
3576 #endif
3577
3578                 if (curlen + datalen > passlen) {
3579                         break;
3580                 }
3581
3582                 cfil_queue_remove(pending_q, data, datalen);
3583
3584                 curlen += datalen;
3585
3586                 for (kcunit += 1;
3587                     kcunit <= MAX_CONTENT_FILTER;
3588                     kcunit++) {
3589                         error = cfil_data_filter(so, cfil_info, kcunit, outgoing,
3590                             data, datalen);
3591                         /* 0 means passed so we can continue */
3592                         if (error != 0) {
3593                                 break;
3594                         }
3595                 }
3596                 /* When data has passed all filters, re-inject */
3597                 if (error == 0) {
3598                         if (outgoing) {
3599                                 cfil_queue_enqueue(
3600                                         &cfil_info->cfi_snd.cfi_inject_q,
3601                                         data, datalen);
3602                                 OSAddAtomic64(datalen,
3603                                     &cfil_stats.cfs_inject_q_out_enqueued);
3604                         } else {
3605                                 cfil_queue_enqueue(
3606                                         &cfil_info->cfi_rcv.cfi_inject_q,
3607                                         data, datalen);
3608                                 OSAddAtomic64(datalen,
3609                                     &cfil_stats.cfs_inject_q_in_enqueued);
3610                         }
3611                 }
3612         }
3613
3614         CFIL_INFO_VERIFY(cfil_info);
3615
3616         return error;
3617 }
3618
3619 int
3620 cfil_update_data_offsets(struct socket *so, struct cfil_info *cfil_info, uint32_t kcunit, int outgoing,
3621     uint64_t pass_offset, uint64_t peek_offset)
3622 {
3623         errno_t error = 0;
3624         struct cfil_entry *entry = NULL;
3625         struct cfe_buf *entrybuf;
3626         int updated = 0;
3627
3628         CFIL_LOG(LOG_INFO, "pass %llu peek %llu", pass_offset, peek_offset);
3629
3630         socket_lock_assert_owned(so);
3631
3632         if (cfil_info == NULL) {
3633                 CFIL_LOG(LOG_ERR, "so %llx cfil detached",
3634                     (uint64_t)VM_KERNEL_ADDRPERM(so));
3635                 error = 0;
3636                 goto done;
3637         } else if (cfil_info->cfi_flags & CFIF_DROP) {
3638                 CFIL_LOG(LOG_ERR, "so %llx drop set",
3639                     (uint64_t)VM_KERNEL_ADDRPERM(so));
3640                 error = EPIPE;
3641                 goto done;
3642         }
3643
3644         entry = &cfil_info->cfi_entries[kcunit - 1];
3645         if (outgoing) {
3646                 entrybuf = &entry->cfe_snd;
3647         } else {
3648                 entrybuf = &entry->cfe_rcv;
3649         }
3650
3651         /* Record updated offsets for this content filter */
3652         if (pass_offset > entrybuf->cfe_pass_offset) {
3653                 entrybuf->cfe_pass_offset = pass_offset;
3654
3655                 if (entrybuf->cfe_peek_offset < entrybuf->cfe_pass_offset) {
3656                         entrybuf->cfe_peek_offset = entrybuf->cfe_pass_offset;
3657                 }
3658                 updated = 1;
3659         } else {
3660                 CFIL_LOG(LOG_INFO, "pass_offset %llu <= cfe_pass_offset %llu",
3661                     pass_offset, entrybuf->cfe_pass_offset);
3662         }
3663         /* Filter does not want or need to see data that's allowed to pass */
3664         if (peek_offset > entrybuf->cfe_pass_offset &&
3665             peek_offset > entrybuf->cfe_peek_offset) {
3666                 entrybuf->cfe_peek_offset = peek_offset;
3667                 updated = 1;
3668         }
3669         /* Nothing to do */
3670         if (updated == 0) {
3671                 goto done;
3672         }
3673
3674         /* Move data held in control queue to pending queue if needed */
3675         error = cfil_data_service_ctl_q(so, cfil_info, kcunit, outgoing);
3676         if (error != 0) {
3677                 CFIL_LOG(LOG_ERR, "cfil_data_service_ctl_q() error %d",
3678                     error);
3679                 goto done;
3680         }
3681         error = EJUSTRETURN;
3682
3683 done:
3684         /*
3685          * The filter is effectively detached when pass all from both sides
3686          * or when the socket is closed and no more data is waiting
3687          * to be delivered to the filter
3688          */
3689         if (entry != NULL &&
3690             ((entry->cfe_snd.cfe_pass_offset == CFM_MAX_OFFSET &&
3691             entry->cfe_rcv.cfe_pass_offset == CFM_MAX_OFFSET) ||
3692             ((cfil_info->cfi_flags & CFIF_CLOSE_WAIT) &&
3693             cfil_queue_empty(&entry->cfe_snd.cfe_ctl_q) &&
3694             cfil_queue_empty(&entry->cfe_rcv.cfe_ctl_q)))) {
3695                 entry->cfe_flags |= CFEF_CFIL_DETACHED;
3696 #if LIFECYCLE_DEBUG
3697                 cfil_info_log(LOG_ERR, cfil_info, outgoing ?
3698                     "CFIL: LIFECYCLE: OUT - PASSED ALL - DETACH":
3699                     "CFIL: LIFECYCLE: IN - PASSED ALL - DETACH");
3700 #endif
3701                 CFIL_LOG(LOG_INFO, "so %llx detached %u",
3702                     (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit);
3703                 if ((cfil_info->cfi_flags & CFIF_CLOSE_WAIT) &&
3704                     cfil_filters_attached(so) == 0) {
3705 #if LIFECYCLE_DEBUG
3706                         cfil_info_log(LOG_ERR, cfil_info, "CFIL: LIFECYCLE: WAKING");
3707 #endif
3708                         CFIL_LOG(LOG_INFO, "so %llx waking",
3709                             (uint64_t)VM_KERNEL_ADDRPERM(so));
3710                         wakeup((caddr_t)cfil_info);
3711                 }
3712         }
3713         CFIL_INFO_VERIFY(cfil_info);
3714         CFIL_LOG(LOG_INFO, "return %d", error);
3715         return error;
3716 }
3717
3718 /*
3719  * Update pass offset for socket when no data is pending
3720  */
3721 static int
3722 cfil_set_socket_pass_offset(struct socket *so, struct cfil_info *cfil_info, int outgoing)
3723 {
3724         struct cfi_buf *cfi_buf;
3725         struct cfil_entry *entry;
3726         struct cfe_buf *entrybuf;
3727         uint32_t kcunit;
3728         uint64_t pass_offset = 0;
3729
3730         if (cfil_info == NULL) {
3731                 return 0;
3732         }
3733
3734         CFIL_LOG(LOG_INFO, "so %llx outgoing %d",
3735             (uint64_t)VM_KERNEL_ADDRPERM(so), outgoing);
3736
3737         socket_lock_assert_owned(so);
3738
3739         if (outgoing) {
3740                 cfi_buf = &cfil_info->cfi_snd;
3741         } else {
3742                 cfi_buf = &cfil_info->cfi_rcv;
3743         }
3744
3745         CFIL_LOG(LOG_DEBUG, "CFIL: <so %llx, sockID %llu> outgoing %d cfi_pending_first %llu cfi_pending_last %llu",
3746             (uint64_t)VM_KERNEL_ADDRPERM(so), cfil_info->cfi_sock_id, outgoing,
3747             cfi_buf->cfi_pending_first, cfi_buf->cfi_pending_last);
3748
3749         if (cfi_buf->cfi_pending_last - cfi_buf->cfi_pending_first == 0) {
3750                 for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
3751                         entry = &cfil_info->cfi_entries[kcunit - 1];
3752
3753                         /* Are we attached to a filter? */
3754                         if (entry->cfe_filter == NULL) {
3755                                 continue;
3756                         }
3757
3758                         if (outgoing) {
3759                                 entrybuf = &entry->cfe_snd;
3760                         } else {
3761                                 entrybuf = &entry->cfe_rcv;
3762                         }
3763
3764                         if (pass_offset == 0 ||
3765                             entrybuf->cfe_pass_offset < pass_offset) {
3766                                 pass_offset = entrybuf->cfe_pass_offset;
3767                         }
3768                 }
3769                 cfi_buf->cfi_pass_offset = pass_offset;
3770         }
3771
3772         CFIL_LOG(LOG_DEBUG, "CFIL: <so %llx, sockID %llu>, cfi_pass_offset %llu",
3773             (uint64_t)VM_KERNEL_ADDRPERM(so), cfil_info->cfi_sock_id, cfi_buf->cfi_pass_offset);
3774
3775         return 0;
3776 }
3777
3778 int
3779 cfil_action_data_pass(struct socket *so, struct cfil_info *cfil_info, uint32_t kcunit, int outgoing,
3780     uint64_t pass_offset, uint64_t peek_offset)
3781 {
3782         errno_t error = 0;
3783
3784         CFIL_LOG(LOG_INFO, "");
3785
3786         socket_lock_assert_owned(so);
3787
3788         error = cfil_acquire_sockbuf(so, cfil_info, outgoing);
3789         if (error != 0) {
3790                 CFIL_LOG(LOG_INFO, "so %llx %s dropped",
3791                     (uint64_t)VM_KERNEL_ADDRPERM(so),
3792                     outgoing ? "out" : "in");
3793                 goto release;
3794         }
3795
3796         error = cfil_update_data_offsets(so, cfil_info, kcunit, outgoing,
3797             pass_offset, peek_offset);
3798
3799         cfil_service_inject_queue(so, cfil_info, outgoing);
3800
3801         cfil_set_socket_pass_offset(so, cfil_info, outgoing);
3802 release:
3803         CFIL_INFO_VERIFY(cfil_info);
3804         cfil_release_sockbuf(so, outgoing);
3805
3806         return error;
3807 }
3808
3809
3810 static void
3811 cfil_flush_queues(struct socket *so, struct cfil_info *cfil_info)
3812 {
3813         struct cfil_entry *entry;
3814         int kcunit;
3815         uint64_t drained;
3816
3817         if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || cfil_info == NULL) {
3818                 goto done;
3819         }
3820
3821         socket_lock_assert_owned(so);
3822
3823         /*
3824          * Flush the output queues and ignore errors as long as
3825          * we are attached
3826          */
3827         (void) cfil_acquire_sockbuf(so, cfil_info, 1);
3828         if (cfil_info != NULL) {
3829                 drained = 0;
3830                 for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
3831                         entry = &cfil_info->cfi_entries[kcunit - 1];
3832
3833                         drained += cfil_queue_drain(&entry->cfe_snd.cfe_ctl_q);
3834                         drained += cfil_queue_drain(&entry->cfe_snd.cfe_pending_q);
3835                 }
3836                 drained += cfil_queue_drain(&cfil_info->cfi_snd.cfi_inject_q);
3837
3838                 if (drained) {
3839                         if (cfil_info->cfi_flags & CFIF_DROP) {
3840                                 OSIncrementAtomic(
3841                                         &cfil_stats.cfs_flush_out_drop);
3842                         } else {
3843                                 OSIncrementAtomic(
3844                                         &cfil_stats.cfs_flush_out_close);
3845                         }
3846                 }
3847         }
3848         cfil_release_sockbuf(so, 1);
3849
3850         /*
3851          * Flush the input queues
3852          */
3853         (void) cfil_acquire_sockbuf(so, cfil_info, 0);
3854         if (cfil_info != NULL) {
3855                 drained = 0;
3856                 for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
3857                         entry = &cfil_info->cfi_entries[kcunit - 1];
3858
3859                         drained += cfil_queue_drain(
3860                                 &entry->cfe_rcv.cfe_ctl_q);
3861                         drained += cfil_queue_drain(
3862                                 &entry->cfe_rcv.cfe_pending_q);
3863                 }
3864                 drained += cfil_queue_drain(&cfil_info->cfi_rcv.cfi_inject_q);
3865
3866                 if (drained) {
3867                         if (cfil_info->cfi_flags & CFIF_DROP) {
3868                                 OSIncrementAtomic(
3869                                         &cfil_stats.cfs_flush_in_drop);
3870                         } else {
3871                                 OSIncrementAtomic(
3872                                         &cfil_stats.cfs_flush_in_close);
3873                         }
3874                 }
3875         }
3876         cfil_release_sockbuf(so, 0);
3877 done:
3878         CFIL_INFO_VERIFY(cfil_info);
3879 }
3880
3881 int
3882 cfil_action_drop(struct socket *so, struct cfil_info *cfil_info, uint32_t kcunit)
3883 {
3884         errno_t error = 0;
3885         struct cfil_entry *entry;
3886         struct proc *p;
3887
3888         if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || cfil_info == NULL) {
3889                 goto done;
3890         }
3891
3892         socket_lock_assert_owned(so);
3893
3894         entry = &cfil_info->cfi_entries[kcunit - 1];
3895
3896         /* Are we attached to the filter? */
3897         if (entry->cfe_filter == NULL) {
3898                 goto done;
3899         }
3900
3901         cfil_info->cfi_flags |= CFIF_DROP;
3902
3903         p = current_proc();
3904
3905         /*
3906          * Force the socket to be marked defunct
3907          * (forcing fixed along with rdar://19391339)
3908          */
3909         if (so->so_cfil_db == NULL) {
3910                 error = sosetdefunct(p, so,
3911                     SHUTDOWN_SOCKET_LEVEL_CONTENT_FILTER | SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL,
3912                     FALSE);
3913
3914                 /* Flush the socket buffer and disconnect */
3915                 if (error == 0) {
3916                         error = sodefunct(p, so,
3917                             SHUTDOWN_SOCKET_LEVEL_CONTENT_FILTER | SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
3918                 }
3919         }
3920
3921         /* The filter is done, mark as detached */
3922         entry->cfe_flags |= CFEF_CFIL_DETACHED;
3923 #if LIFECYCLE_DEBUG
3924         cfil_info_log(LOG_ERR, cfil_info, "CFIL: LIFECYCLE: DROP - DETACH");
3925 #endif
3926         CFIL_LOG(LOG_INFO, "so %llx detached %u",
3927             (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit);
3928
3929         /* Pending data needs to go */
3930         cfil_flush_queues(so, cfil_info);
3931
3932         if (cfil_info && (cfil_info->cfi_flags & CFIF_CLOSE_WAIT)) {
3933                 if (cfil_filters_attached(so) == 0) {
3934                         CFIL_LOG(LOG_INFO, "so %llx waking",
3935                             (uint64_t)VM_KERNEL_ADDRPERM(so));
3936                         wakeup((caddr_t)cfil_info);
3937                 }
3938         }
3939 done:
3940         return error;
3941 }
3942
3943 int
3944 cfil_action_bless_client(uint32_t kcunit, struct cfil_msg_hdr *msghdr)
3945 {
3946         errno_t error = 0;
3947         struct cfil_info *cfil_info = NULL;
3948
3949         bool cfil_attached = false;
3950         struct cfil_msg_bless_client *blessmsg = (struct cfil_msg_bless_client *)msghdr;
3951
3952         // Search and lock socket
3953         struct socket *so = cfil_socket_from_client_uuid(blessmsg->cfb_client_uuid, &cfil_attached);
3954         if (so == NULL) {
3955                 error = ENOENT;
3956         } else {
3957                 // The client gets a pass automatically
3958                 cfil_info = (so->so_cfil_db != NULL) ?
3959                     cfil_db_get_cfil_info(so->so_cfil_db, msghdr->cfm_sock_id) : so->so_cfil;
3960
3961                 if (cfil_attached) {
3962 #if VERDICT_DEBUG
3963                         if (cfil_info != NULL) {
3964                                 CFIL_LOG(LOG_ERR, "CFIL: VERDICT RECEIVED: BLESS %s <so %llx sockID %llu>",
3965                                     cfil_info->cfi_hash_entry ? "UDP" : "TCP",
3966                                     (uint64_t)VM_KERNEL_ADDRPERM(so),
3967                                     cfil_info->cfi_sock_id);
3968                         }
3969 #endif
3970                         (void)cfil_action_data_pass(so, cfil_info, kcunit, 1, CFM_MAX_OFFSET, CFM_MAX_OFFSET);
3971                         (void)cfil_action_data_pass(so, cfil_info, kcunit, 0, CFM_MAX_OFFSET, CFM_MAX_OFFSET);
3972                 } else {
3973                         so->so_flags1 |= SOF1_CONTENT_FILTER_SKIP;
3974                 }
3975                 socket_unlock(so, 1);
3976         }
3977
3978         return error;
3979 }
3980
3981 static int
3982 cfil_update_entry_offsets(struct socket *so, struct cfil_info *cfil_info, int outgoing, unsigned int datalen)
3983 {
3984         struct cfil_entry *entry;
3985         struct cfe_buf *entrybuf;
3986         uint32_t kcunit;
3987
3988         CFIL_LOG(LOG_INFO, "so %llx outgoing %d datalen %u",
3989             (uint64_t)VM_KERNEL_ADDRPERM(so), outgoing, datalen);
3990
3991         for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
3992                 entry = &cfil_info->cfi_entries[kcunit - 1];
3993
3994                 /* Are we attached to the filter? */
3995                 if (entry->cfe_filter == NULL) {
3996                         continue;
3997                 }
3998
3999                 if (outgoing) {
4000                         entrybuf = &entry->cfe_snd;
4001                 } else {
4002                         entrybuf = &entry->cfe_rcv;
4003                 }
4004
4005                 entrybuf->cfe_ctl_q.q_start += datalen;
4006                 entrybuf->cfe_pass_offset = entrybuf->cfe_ctl_q.q_start;
4007                 entrybuf->cfe_peeked = entrybuf->cfe_ctl_q.q_start;
4008                 if (entrybuf->cfe_peek_offset < entrybuf->cfe_pass_offset) {
4009                         entrybuf->cfe_peek_offset = entrybuf->cfe_pass_offset;
4010                 }
4011
4012                 entrybuf->cfe_ctl_q.q_end += datalen;
4013
4014                 entrybuf->cfe_pending_q.q_start += datalen;
4015                 entrybuf->cfe_pending_q.q_end += datalen;
4016         }
4017         CFIL_INFO_VERIFY(cfil_info);
4018         return 0;
4019 }
4020
4021 int
4022 cfil_data_common(struct socket *so, struct cfil_info *cfil_info, int outgoing, struct sockaddr *to,
4023     struct mbuf *data, struct mbuf *control, uint32_t flags)
4024 {
4025 #pragma unused(to, control, flags)
4026         errno_t error = 0;
4027         unsigned int datalen;
4028         int mbcnt = 0;
4029         int mbnum = 0;
4030         int kcunit;
4031         struct cfi_buf *cfi_buf;
4032         struct mbuf *chain = NULL;
4033
4034         if (cfil_info == NULL) {
4035                 CFIL_LOG(LOG_ERR, "so %llx cfil detached",
4036                     (uint64_t)VM_KERNEL_ADDRPERM(so));
4037                 error = 0;
4038                 goto done;
4039         } else if (cfil_info->cfi_flags & CFIF_DROP) {
4040                 CFIL_LOG(LOG_ERR, "so %llx drop set",
4041                     (uint64_t)VM_KERNEL_ADDRPERM(so));
4042                 error = EPIPE;
4043                 goto done;
4044         }
4045
4046         datalen = cfil_data_length(data, &mbcnt, &mbnum);
4047
4048         if (outgoing) {
4049                 cfi_buf = &cfil_info->cfi_snd;
4050         } else {
4051                 cfi_buf = &cfil_info->cfi_rcv;
4052         }
4053
4054         cfi_buf->cfi_pending_last += datalen;
4055         cfi_buf->cfi_pending_mbcnt += mbcnt;
4056         cfi_buf->cfi_pending_mbnum += mbnum;
4057
4058         if (IS_UDP(so)) {
4059                 if (cfi_buf->cfi_pending_mbnum > cfil_udp_gc_mbuf_num_max ||
4060                     cfi_buf->cfi_pending_mbcnt > cfil_udp_gc_mbuf_cnt_max) {
4061                         cfi_buf->cfi_tail_drop_cnt++;
4062                         cfi_buf->cfi_pending_mbcnt -= mbcnt;
4063                         cfi_buf->cfi_pending_mbnum -= mbnum;
4064                         return EPIPE;
4065                 }
4066         }
4067
4068         cfil_info_buf_verify(cfi_buf);
4069
4070 #if DATA_DEBUG
4071         CFIL_LOG(LOG_DEBUG, "CFIL: QUEUEING DATA: <so %llx> %s: data %llx len %u flags 0x%x nextpkt %llx - cfi_pending_last %llu cfi_pending_mbcnt %u   cfi_pass_offset %llu",
4072             (uint64_t)VM_KERNEL_ADDRPERM(so),
4073             outgoing ? "OUT" : "IN",
4074             (uint64_t)VM_KERNEL_ADDRPERM(data), datalen, data->m_flags,
4075             (uint64_t)VM_KERNEL_ADDRPERM(data->m_nextpkt),
4076             cfi_buf->cfi_pending_last,
4077             cfi_buf->cfi_pending_mbcnt,
4078             cfi_buf->cfi_pass_offset);
4079 #endif
4080
4081         /* Fast path when below pass offset */
4082         if (cfi_buf->cfi_pending_last <= cfi_buf->cfi_pass_offset) {
4083                 cfil_update_entry_offsets(so, cfil_info, outgoing, datalen);
4084 #if DATA_DEBUG
4085                 CFIL_LOG(LOG_DEBUG, "CFIL: QUEUEING DATA: FAST PATH");
4086 #endif
4087         } else {
4088                 for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
4089                         // Is cfil attached to this filter?
4090                         if (IS_ENTRY_ATTACHED(cfil_info, kcunit)) {
4091                                 if (IS_UDP(so)) {
4092                                         /* UDP only:
4093                                          * Chain addr (incoming only TDB), control (optional) and data into one chain.
4094                                          * This full chain will be reinjected into socket after recieving verdict.
4095                                          */
4096                                         (void) cfil_udp_save_socket_state(cfil_info, data);
4097                                         chain = sbconcat_mbufs(NULL, outgoing ? NULL : to, data, control);
4098                                         if (chain == NULL) {
4099                                                 return ENOBUFS;
4100                                         }
4101                                         data = chain;
4102                                 }
4103                                 error = cfil_data_filter(so, cfil_info, kcunit, outgoing, data,
4104                                     datalen);
4105                         }
4106                         /* 0 means passed so continue with next filter */
4107                         if (error != 0) {
4108                                 break;
4109                         }
4110                 }
4111         }
4112
4113         /* Move cursor if no filter claimed the data */
4114         if (error == 0) {
4115                 cfi_buf->cfi_pending_first += datalen;
4116                 cfi_buf->cfi_pending_mbcnt -= mbcnt;
4117                 cfi_buf->cfi_pending_mbnum -= mbnum;
4118                 cfil_info_buf_verify(cfi_buf);
4119         }
4120 done:
4121         CFIL_INFO_VERIFY(cfil_info);
4122
4123         return error;
4124 }
4125
4126 /*
4127  * Callback from socket layer sosendxxx()
4128  */
4129 int
4130 cfil_sock_data_out(struct socket *so, struct sockaddr  *to,
4131     struct mbuf *data, struct mbuf *control, uint32_t flags)
4132 {
4133         int error = 0;
4134
4135         if (IS_UDP(so)) {
4136                 return cfil_sock_udp_handle_data(TRUE, so, NULL, to, data, control, flags);
4137         }
4138
4139         if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL) {
4140                 return 0;
4141         }
4142
4143         socket_lock_assert_owned(so);
4144
4145         if (so->so_cfil->cfi_flags & CFIF_DROP) {
4146                 CFIL_LOG(LOG_ERR, "so %llx drop set",
4147                     (uint64_t)VM_KERNEL_ADDRPERM(so));
4148                 return EPIPE;
4149         }
4150         if (control != NULL) {
4151                 CFIL_LOG(LOG_ERR, "so %llx control",
4152                     (uint64_t)VM_KERNEL_ADDRPERM(so));
4153                 OSIncrementAtomic(&cfil_stats.cfs_data_out_control);
4154         }
4155         if ((flags & MSG_OOB)) {
4156                 CFIL_LOG(LOG_ERR, "so %llx MSG_OOB",
4157                     (uint64_t)VM_KERNEL_ADDRPERM(so));
4158                 OSIncrementAtomic(&cfil_stats.cfs_data_out_oob);
4159         }
4160         if ((so->so_snd.sb_flags & SB_LOCK) == 0) {
4161                 panic("so %p SB_LOCK not set", so);
4162         }
4163
4164         if (so->so_snd.sb_cfil_thread != NULL) {
4165                 panic("%s sb_cfil_thread %p not NULL", __func__,
4166                     so->so_snd.sb_cfil_thread);
4167         }
4168
4169         error = cfil_data_common(so, so->so_cfil, 1, to, data, control, flags);
4170
4171         return error;
4172 }
4173
4174 /*
4175  * Callback from socket layer sbappendxxx()
4176  */
4177 int
4178 cfil_sock_data_in(struct socket *so, struct sockaddr *from,
4179     struct mbuf *data, struct mbuf *control, uint32_t flags)
4180 {
4181         int error = 0;
4182
4183         if (IS_UDP(so)) {
4184                 return cfil_sock_udp_handle_data(FALSE, so, NULL, from, data, control, flags);
4185         }
4186
4187         if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL) {
4188                 return 0;
4189         }
4190
4191         socket_lock_assert_owned(so);
4192
4193         if (so->so_cfil->cfi_flags & CFIF_DROP) {
4194                 CFIL_LOG(LOG_ERR, "so %llx drop set",
4195                     (uint64_t)VM_KERNEL_ADDRPERM(so));
4196                 return EPIPE;
4197         }
4198         if (control != NULL) {
4199                 CFIL_LOG(LOG_ERR, "so %llx control",
4200                     (uint64_t)VM_KERNEL_ADDRPERM(so));
4201                 OSIncrementAtomic(&cfil_stats.cfs_data_in_control);
4202         }
4203         if (data->m_type == MT_OOBDATA) {
4204                 CFIL_LOG(LOG_ERR, "so %llx MSG_OOB",
4205                     (uint64_t)VM_KERNEL_ADDRPERM(so));
4206                 OSIncrementAtomic(&cfil_stats.cfs_data_in_oob);
4207         }
4208         error = cfil_data_common(so, so->so_cfil, 0, from, data, control, flags);
4209
4210         return error;
4211 }
4212
4213 /*
4214  * Callback from socket layer soshutdownxxx()
4215  *
4216  * We may delay the shutdown write if there's outgoing data in process.
4217  *
4218  * There is no point in delaying the shutdown read because the process
4219  * indicated that it does not want to read anymore data.
4220  */
4221 int
4222 cfil_sock_shutdown(struct socket *so, int *how)
4223 {
4224         int error = 0;
4225
4226         if (IS_UDP(so)) {
4227                 return cfil_sock_udp_shutdown(so, how);
4228         }
4229
4230         if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL) {
4231                 goto done;
4232         }
4233
4234         socket_lock_assert_owned(so);
4235
4236         CFIL_LOG(LOG_INFO, "so %llx how %d",
4237             (uint64_t)VM_KERNEL_ADDRPERM(so), *how);
4238
4239         /*
4240          * Check the state of the socket before the content filter
4241          */
4242         if (*how != SHUT_WR && (so->so_state & SS_CANTRCVMORE) != 0) {
4243                 /* read already shut down */
4244                 error = ENOTCONN;
4245                 goto done;
4246         }
4247         if (*how != SHUT_RD && (so->so_state & SS_CANTSENDMORE) != 0) {
4248                 /* write already shut down */
4249                 error = ENOTCONN;
4250                 goto done;
4251         }
4252
4253         if ((so->so_cfil->cfi_flags & CFIF_DROP) != 0) {
4254                 CFIL_LOG(LOG_ERR, "so %llx drop set",
4255                     (uint64_t)VM_KERNEL_ADDRPERM(so));
4256                 goto done;
4257         }
4258
4259         /*
4260          * shutdown read: SHUT_RD or SHUT_RDWR
4261          */
4262         if (*how != SHUT_WR) {
4263                 if (so->so_cfil->cfi_flags & CFIF_SHUT_RD) {
4264                         error = ENOTCONN;
4265                         goto done;
4266                 }
4267                 so->so_cfil->cfi_flags |= CFIF_SHUT_RD;
4268                 cfil_sock_notify_shutdown(so, SHUT_RD);
4269         }
4270         /*
4271          * shutdown write: SHUT_WR or SHUT_RDWR
4272          */
4273         if (*how != SHUT_RD) {
4274                 if (so->so_cfil->cfi_flags & CFIF_SHUT_WR) {
4275                         error = ENOTCONN;
4276                         goto done;
4277                 }
4278                 so->so_cfil->cfi_flags |= CFIF_SHUT_WR;
4279                 cfil_sock_notify_shutdown(so, SHUT_WR);
4280                 /*
4281                  * When outgoing data is pending, we delay the shutdown at the
4282                  * protocol level until the content filters give the final
4283                  * verdict on the pending data.
4284                  */
4285                 if (cfil_sock_data_pending(&so->so_snd) != 0) {
4286                         /*
4287                          * When shutting down the read and write sides at once
4288                          * we can proceed to the final shutdown of the read
4289                          * side. Otherwise, we just return.
4290                          */
4291                         if (*how == SHUT_WR) {
4292                                 error = EJUSTRETURN;
4293                         } else if (*how == SHUT_RDWR) {
4294                                 *how = SHUT_RD;
4295                         }
4296                 }
4297         }
4298 done:
4299         return error;
4300 }
4301
4302 /*
4303  * This is called when the socket is closed and there is no more
4304  * opportunity for filtering
4305  */
4306 void
4307 cfil_sock_is_closed(struct socket *so)
4308 {
4309         errno_t error = 0;
4310         int kcunit;
4311
4312         if (IS_UDP(so)) {
4313                 cfil_sock_udp_is_closed(so);
4314                 return;
4315         }
4316
4317         if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL) {
4318                 return;
4319         }
4320
4321         CFIL_LOG(LOG_INFO, "so %llx", (uint64_t)VM_KERNEL_ADDRPERM(so));
4322
4323         socket_lock_assert_owned(so);
4324
4325         for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
4326                 /* Let the filters know of the closing */
4327                 error = cfil_dispatch_closed_event(so, so->so_cfil, kcunit);
4328         }
4329
4330         /* Last chance to push passed data out */
4331         error = cfil_acquire_sockbuf(so, so->so_cfil, 1);
4332         if (error == 0) {
4333                 cfil_service_inject_queue(so, so->so_cfil, 1);
4334         }
4335         cfil_release_sockbuf(so, 1);
4336
4337         so->so_cfil->cfi_flags |= CFIF_SOCK_CLOSED;
4338
4339         /* Pending data needs to go */
4340         cfil_flush_queues(so, so->so_cfil);
4341
4342         CFIL_INFO_VERIFY(so->so_cfil);
4343 }
4344
4345 /*
4346  * This is called when the socket is disconnected so let the filters
4347  * know about the disconnection and that no more data will come
4348  *
4349  * The how parameter has the same values as soshutown()
4350  */
4351 void
4352 cfil_sock_notify_shutdown(struct socket *so, int how)
4353 {
4354         errno_t error = 0;
4355         int kcunit;
4356
4357         if (IS_UDP(so)) {
4358                 cfil_sock_udp_notify_shutdown(so, how, 0, 0);
4359                 return;
4360         }
4361
4362         if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL) {
4363                 return;
4364         }
4365
4366         CFIL_LOG(LOG_INFO, "so %llx how %d",
4367             (uint64_t)VM_KERNEL_ADDRPERM(so), how);
4368
4369         socket_lock_assert_owned(so);
4370
4371         for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
4372                 /* Disconnect incoming side */
4373                 if (how != SHUT_WR) {
4374                         error = cfil_dispatch_disconnect_event(so, so->so_cfil, kcunit, 0);
4375                 }
4376                 /* Disconnect outgoing side */
4377                 if (how != SHUT_RD) {
4378                         error = cfil_dispatch_disconnect_event(so, so->so_cfil, kcunit, 1);
4379                 }
4380         }
4381 }
4382
4383 static int
4384 cfil_filters_attached(struct socket *so)
4385 {
4386         struct cfil_entry *entry;
4387         uint32_t kcunit;
4388         int attached = 0;
4389
4390         if (IS_UDP(so)) {
4391                 return cfil_filters_udp_attached(so, FALSE);
4392         }
4393
4394         if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL) {
4395                 return 0;
4396         }
4397
4398         socket_lock_assert_owned(so);
4399
4400         for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
4401                 entry = &so->so_cfil->cfi_entries[kcunit - 1];
4402
4403                 /* Are we attached to the filter? */
4404                 if (entry->cfe_filter == NULL) {
4405                         continue;
4406                 }
4407                 if ((entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED) == 0) {
4408                         continue;
4409                 }
4410                 if ((entry->cfe_flags & CFEF_CFIL_DETACHED) != 0) {
4411                         continue;
4412                 }
4413                 attached = 1;
4414                 break;
4415         }
4416
4417         return attached;
4418 }
4419
4420 /*
4421  * This is called when the socket is closed and we are waiting for
4422  * the filters to gives the final pass or drop
4423  */
4424 void
4425 cfil_sock_close_wait(struct socket *so)
4426 {
4427         lck_mtx_t *mutex_held;
4428         struct timespec ts;
4429         int error;
4430
4431         if (IS_UDP(so)) {
4432                 cfil_sock_udp_close_wait(so);
4433                 return;
4434         }
4435
4436         if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL) {
4437                 return;
4438         }
4439
4440         CFIL_LOG(LOG_INFO, "so %llx", (uint64_t)VM_KERNEL_ADDRPERM(so));
4441
4442         if (so->so_proto->pr_getlock != NULL) {
4443                 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
4444         } else {
4445                 mutex_held = so->so_proto->pr_domain->dom_mtx;
4446         }
4447         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
4448
4449         while (cfil_filters_attached(so)) {
4450                 /*
4451                  * Notify the filters we are going away so they can detach
4452                  */
4453                 cfil_sock_notify_shutdown(so, SHUT_RDWR);
4454
4455                 /*
4456                  * Make sure we need to wait after the filter are notified
4457                  * of the disconnection
4458                  */
4459                 if (cfil_filters_attached(so) == 0) {
4460                         break;
4461                 }
4462
4463                 CFIL_LOG(LOG_INFO, "so %llx waiting",
4464                     (uint64_t)VM_KERNEL_ADDRPERM(so));
4465
4466                 ts.tv_sec = cfil_close_wait_timeout / 1000;
4467                 ts.tv_nsec = (cfil_close_wait_timeout % 1000) *
4468                     NSEC_PER_USEC * 1000;
4469
4470                 OSIncrementAtomic(&cfil_stats.cfs_close_wait);
4471                 so->so_cfil->cfi_flags |= CFIF_CLOSE_WAIT;
4472                 error = msleep((caddr_t)so->so_cfil, mutex_held,
4473                     PSOCK | PCATCH, "cfil_sock_close_wait", &ts);
4474                 so->so_cfil->cfi_flags &= ~CFIF_CLOSE_WAIT;
4475
4476                 CFIL_LOG(LOG_NOTICE, "so %llx timed out %d",
4477                     (uint64_t)VM_KERNEL_ADDRPERM(so), (error != 0));
4478
4479                 /*
4480                  * Force close in case of timeout
4481                  */
4482                 if (error != 0) {
4483                         OSIncrementAtomic(&cfil_stats.cfs_close_wait_timeout);
4484                         break;
4485                 }
4486         }
4487 }
4488
4489 /*
4490  * Returns the size of the data held by the content filter by using
4491  */
4492 int32_t
4493 cfil_sock_data_pending(struct sockbuf *sb)
4494 {
4495         struct socket *so = sb->sb_so;
4496         uint64_t pending = 0;
4497
4498         if (IS_UDP(so)) {
4499                 return cfil_sock_udp_data_pending(sb, FALSE);
4500         }
4501
4502         if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_cfil != NULL) {
4503                 struct cfi_buf *cfi_buf;
4504
4505                 socket_lock_assert_owned(so);
4506
4507                 if ((sb->sb_flags & SB_RECV) == 0) {
4508                         cfi_buf = &so->so_cfil->cfi_snd;
4509                 } else {
4510                         cfi_buf = &so->so_cfil->cfi_rcv;
4511                 }
4512
4513                 pending = cfi_buf->cfi_pending_last -
4514                     cfi_buf->cfi_pending_first;
4515
4516                 /*
4517                  * If we are limited by the "chars of mbufs used" roughly
4518                  * adjust so we won't overcommit
4519                  */
4520                 if (pending > (uint64_t)cfi_buf->cfi_pending_mbcnt) {
4521                         pending = cfi_buf->cfi_pending_mbcnt;
4522                 }
4523         }
4524
4525         VERIFY(pending < INT32_MAX);
4526
4527         return (int32_t)(pending);
4528 }
4529
4530 /*
4531  * Return the socket buffer space used by data being held by content filters
4532  * so processes won't clog the socket buffer
4533  */
4534 int32_t
4535 cfil_sock_data_space(struct sockbuf *sb)
4536 {
4537         struct socket *so = sb->sb_so;
4538         uint64_t pending = 0;
4539
4540         if (IS_UDP(so)) {
4541                 return cfil_sock_udp_data_pending(sb, TRUE);
4542         }
4543
4544         if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_cfil != NULL &&
4545             so->so_snd.sb_cfil_thread != current_thread()) {
4546                 struct cfi_buf *cfi_buf;
4547
4548                 socket_lock_assert_owned(so);
4549
4550                 if ((sb->sb_flags & SB_RECV) == 0) {
4551                         cfi_buf = &so->so_cfil->cfi_snd;
4552                 } else {
4553                         cfi_buf = &so->so_cfil->cfi_rcv;
4554                 }
4555
4556                 pending = cfi_buf->cfi_pending_last -
4557                     cfi_buf->cfi_pending_first;
4558
4559                 /*
4560                  * If we are limited by the "chars of mbufs used" roughly
4561                  * adjust so we won't overcommit
4562                  */
4563                 if ((uint64_t)cfi_buf->cfi_pending_mbcnt > pending) {
4564                         pending = cfi_buf->cfi_pending_mbcnt;
4565                 }
4566         }
4567
4568         VERIFY(pending < INT32_MAX);
4569
4570         return (int32_t)(pending);
4571 }
4572
4573 /*
4574  * A callback from the socket and protocol layer when data becomes
4575  * available in the socket buffer to give a chance for the content filter
4576  * to re-inject data that was held back
4577  */
4578 void
4579 cfil_sock_buf_update(struct sockbuf *sb)
4580 {
4581         int outgoing;
4582         int error;
4583         struct socket *so = sb->sb_so;
4584
4585         if (IS_UDP(so)) {
4586                 cfil_sock_udp_buf_update(sb);
4587                 return;
4588         }
4589
4590         if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL) {
4591                 return;
4592         }
4593
4594         if (!cfil_sbtrim) {
4595                 return;
4596         }
4597
4598         socket_lock_assert_owned(so);
4599
4600         if ((sb->sb_flags & SB_RECV) == 0) {
4601                 if ((so->so_cfil->cfi_flags & CFIF_RETRY_INJECT_OUT) == 0) {
4602                         return;
4603                 }
4604                 outgoing = 1;
4605                 OSIncrementAtomic(&cfil_stats.cfs_inject_q_out_retry);
4606         } else {
4607                 if ((so->so_cfil->cfi_flags & CFIF_RETRY_INJECT_IN) == 0) {
4608                         return;
4609                 }
4610                 outgoing = 0;
4611                 OSIncrementAtomic(&cfil_stats.cfs_inject_q_in_retry);
4612         }
4613
4614         CFIL_LOG(LOG_NOTICE, "so %llx outgoing %d",
4615             (uint64_t)VM_KERNEL_ADDRPERM(so), outgoing);
4616
4617         error = cfil_acquire_sockbuf(so, so->so_cfil, outgoing);
4618         if (error == 0) {
4619                 cfil_service_inject_queue(so, so->so_cfil, outgoing);
4620         }
4621         cfil_release_sockbuf(so, outgoing);
4622 }
4623
4624 int
4625 sysctl_cfil_filter_list(struct sysctl_oid *oidp, void *arg1, int arg2,
4626     struct sysctl_req *req)
4627 {
4628 #pragma unused(oidp, arg1, arg2)
4629         int error = 0;
4630         size_t len = 0;
4631         u_int32_t i;
4632
4633         /* Read only  */
4634         if (req->newptr != USER_ADDR_NULL) {
4635                 return EPERM;
4636         }
4637
4638         cfil_rw_lock_shared(&cfil_lck_rw);
4639
4640         for (i = 0; content_filters != NULL && i < MAX_CONTENT_FILTER; i++) {
4641                 struct cfil_filter_stat filter_stat;
4642                 struct content_filter *cfc = content_filters[i];
4643
4644                 if (cfc == NULL) {
4645                         continue;
4646                 }
4647
4648                 /* If just asking for the size */
4649                 if (req->oldptr == USER_ADDR_NULL) {
4650                         len += sizeof(struct cfil_filter_stat);
4651                         continue;
4652                 }
4653
4654                 bzero(&filter_stat, sizeof(struct cfil_filter_stat));
4655                 filter_stat.cfs_len = sizeof(struct cfil_filter_stat);
4656                 filter_stat.cfs_filter_id = cfc->cf_kcunit;
4657                 filter_stat.cfs_flags = cfc->cf_flags;
4658                 filter_stat.cfs_sock_count = cfc->cf_sock_count;
4659                 filter_stat.cfs_necp_control_unit = cfc->cf_necp_control_unit;
4660
4661                 error = SYSCTL_OUT(req, &filter_stat,
4662                     sizeof(struct cfil_filter_stat));
4663                 if (error != 0) {
4664                         break;
4665                 }
4666         }
4667         /* If just asking for the size */
4668         if (req->oldptr == USER_ADDR_NULL) {
4669                 req->oldidx = len;
4670         }
4671
4672         cfil_rw_unlock_shared(&cfil_lck_rw);
4673
4674 #if SHOW_DEBUG
4675         if (req->oldptr != USER_ADDR_NULL) {
4676                 for (i = 1; content_filters != NULL && i <= MAX_CONTENT_FILTER; i++) {
4677                         cfil_filter_show(i);
4678                 }
4679         }
4680 #endif
4681
4682         return error;
4683 }
4684
4685 static int
4686 sysctl_cfil_sock_list(struct sysctl_oid *oidp, void *arg1, int arg2,
4687     struct sysctl_req *req)
4688 {
4689 #pragma unused(oidp, arg1, arg2)
4690         int error = 0;
4691         u_int32_t i;
4692         struct cfil_info *cfi;
4693
4694         /* Read only  */
4695         if (req->newptr != USER_ADDR_NULL) {
4696                 return EPERM;
4697         }
4698
4699         cfil_rw_lock_shared(&cfil_lck_rw);
4700
4701         /*
4702          * If just asking for the size,
4703          */
4704         if (req->oldptr == USER_ADDR_NULL) {
4705                 req->oldidx = cfil_sock_attached_count *
4706                     sizeof(struct cfil_sock_stat);
4707                 /* Bump the length in case new sockets gets attached */
4708                 req->oldidx += req->oldidx >> 3;
4709                 goto done;
4710         }
4711
4712         TAILQ_FOREACH(cfi, &cfil_sock_head, cfi_link) {
4713                 struct cfil_entry *entry;
4714                 struct cfil_sock_stat stat;
4715                 struct socket *so = cfi->cfi_so;
4716
4717                 bzero(&stat, sizeof(struct cfil_sock_stat));
4718                 stat.cfs_len = sizeof(struct cfil_sock_stat);
4719                 stat.cfs_sock_id = cfi->cfi_sock_id;
4720                 stat.cfs_flags = cfi->cfi_flags;
4721
4722                 if (so != NULL) {
4723                         stat.cfs_pid = so->last_pid;
4724                         memcpy(stat.cfs_uuid, so->last_uuid,
4725                             sizeof(uuid_t));
4726                         if (so->so_flags & SOF_DELEGATED) {
4727                                 stat.cfs_e_pid = so->e_pid;
4728                                 memcpy(stat.cfs_e_uuid, so->e_uuid,
4729                                     sizeof(uuid_t));
4730                         } else {
4731                                 stat.cfs_e_pid = so->last_pid;
4732                                 memcpy(stat.cfs_e_uuid, so->last_uuid,
4733                                     sizeof(uuid_t));
4734                         }
4735
4736                         stat.cfs_sock_family = so->so_proto->pr_domain->dom_family;
4737                         stat.cfs_sock_type = so->so_proto->pr_type;
4738                         stat.cfs_sock_protocol = so->so_proto->pr_protocol;
4739                 }
4740
4741                 stat.cfs_snd.cbs_pending_first =
4742                     cfi->cfi_snd.cfi_pending_first;
4743                 stat.cfs_snd.cbs_pending_last =
4744                     cfi->cfi_snd.cfi_pending_last;
4745                 stat.cfs_snd.cbs_inject_q_len =
4746                     cfil_queue_len(&cfi->cfi_snd.cfi_inject_q);
4747                 stat.cfs_snd.cbs_pass_offset =
4748                     cfi->cfi_snd.cfi_pass_offset;
4749
4750                 stat.cfs_rcv.cbs_pending_first =
4751                     cfi->cfi_rcv.cfi_pending_first;
4752                 stat.cfs_rcv.cbs_pending_last =
4753                     cfi->cfi_rcv.cfi_pending_last;
4754                 stat.cfs_rcv.cbs_inject_q_len =
4755                     cfil_queue_len(&cfi->cfi_rcv.cfi_inject_q);
4756                 stat.cfs_rcv.cbs_pass_offset =
4757                     cfi->cfi_rcv.cfi_pass_offset;
4758
4759                 for (i = 0; i < MAX_CONTENT_FILTER; i++) {
4760                         struct cfil_entry_stat *estat;
4761                         struct cfe_buf *ebuf;
4762                         struct cfe_buf_stat *sbuf;
4763
4764                         entry = &cfi->cfi_entries[i];
4765
4766                         estat = &stat.ces_entries[i];
4767
4768                         estat->ces_len = sizeof(struct cfil_entry_stat);
4769                         estat->ces_filter_id = entry->cfe_filter ?
4770                             entry->cfe_filter->cf_kcunit : 0;
4771                         estat->ces_flags = entry->cfe_flags;
4772                         estat->ces_necp_control_unit =
4773                             entry->cfe_necp_control_unit;
4774
4775                         estat->ces_last_event.tv_sec =
4776                             (int64_t)entry->cfe_last_event.tv_sec;
4777                         estat->ces_last_event.tv_usec =
4778                             (int64_t)entry->cfe_last_event.tv_usec;
4779
4780                         estat->ces_last_action.tv_sec =
4781                             (int64_t)entry->cfe_last_action.tv_sec;
4782                         estat->ces_last_action.tv_usec =
4783                             (int64_t)entry->cfe_last_action.tv_usec;
4784
4785                         ebuf = &entry->cfe_snd;
4786                         sbuf = &estat->ces_snd;
4787                         sbuf->cbs_pending_first =
4788                             cfil_queue_offset_first(&ebuf->cfe_pending_q);
4789                         sbuf->cbs_pending_last =
4790                             cfil_queue_offset_last(&ebuf->cfe_pending_q);
4791                         sbuf->cbs_ctl_first =
4792                             cfil_queue_offset_first(&ebuf->cfe_ctl_q);
4793                         sbuf->cbs_ctl_last =
4794                             cfil_queue_offset_last(&ebuf->cfe_ctl_q);
4795                         sbuf->cbs_pass_offset =  ebuf->cfe_pass_offset;
4796                         sbuf->cbs_peek_offset =  ebuf->cfe_peek_offset;
4797                         sbuf->cbs_peeked =  ebuf->cfe_peeked;
4798
4799                         ebuf = &entry->cfe_rcv;
4800                         sbuf = &estat->ces_rcv;
4801                         sbuf->cbs_pending_first =
4802                             cfil_queue_offset_first(&ebuf->cfe_pending_q);
4803                         sbuf->cbs_pending_last =
4804                             cfil_queue_offset_last(&ebuf->cfe_pending_q);
4805                         sbuf->cbs_ctl_first =
4806                             cfil_queue_offset_first(&ebuf->cfe_ctl_q);
4807                         sbuf->cbs_ctl_last =
4808                             cfil_queue_offset_last(&ebuf->cfe_ctl_q);
4809                         sbuf->cbs_pass_offset =  ebuf->cfe_pass_offset;
4810                         sbuf->cbs_peek_offset =  ebuf->cfe_peek_offset;
4811                         sbuf->cbs_peeked =  ebuf->cfe_peeked;
4812                 }
4813                 error = SYSCTL_OUT(req, &stat,
4814                     sizeof(struct cfil_sock_stat));
4815                 if (error != 0) {
4816                         break;
4817                 }
4818         }
4819 done:
4820         cfil_rw_unlock_shared(&cfil_lck_rw);
4821
4822 #if SHOW_DEBUG
4823         if (req->oldptr != USER_ADDR_NULL) {
4824                 cfil_info_show();
4825         }
4826 #endif
4827
4828         return error;
4829 }
4830
4831 /*
4832  * UDP Socket Support
4833  */
4834 static void
4835 cfil_hash_entry_log(int level, struct socket *so, struct cfil_hash_entry *entry, uint64_t sockId, const char* msg)
4836 {
4837         char local[MAX_IPv6_STR_LEN + 6];
4838         char remote[MAX_IPv6_STR_LEN + 6];
4839         const void  *addr;
4840
4841         // No sock or not UDP, no-op
4842         if (so == NULL || entry == NULL) {
4843                 return;
4844         }
4845
4846         local[0] = remote[0] = 0x0;
4847
4848         switch (entry->cfentry_family) {
4849         case AF_INET6:
4850                 addr = &entry->cfentry_laddr.addr6;
4851                 inet_ntop(AF_INET6, addr, local, sizeof(local));
4852                 addr = &entry->cfentry_faddr.addr6;
4853                 inet_ntop(AF_INET6, addr, remote, sizeof(local));
4854                 break;
4855         case AF_INET:
4856                 addr = &entry->cfentry_laddr.addr46.ia46_addr4.s_addr;
4857                 inet_ntop(AF_INET, addr, local, sizeof(local));
4858                 addr = &entry->cfentry_faddr.addr46.ia46_addr4.s_addr;
4859                 inet_ntop(AF_INET, addr, remote, sizeof(local));
4860                 break;
4861         default:
4862                 return;
4863         }
4864
4865         CFIL_LOG(level, "<%s>: <UDP so %llx, entry %p, sockID %llu> lport %d fport %d laddr %s faddr %s",
4866             msg,
4867             (uint64_t)VM_KERNEL_ADDRPERM(so), entry, sockId,
4868             ntohs(entry->cfentry_lport), ntohs(entry->cfentry_fport), local, remote);
4869 }
4870
4871 static void
4872 cfil_inp_log(int level, struct socket *so, const char* msg)
4873 {
4874         struct inpcb *inp = NULL;
4875         char local[MAX_IPv6_STR_LEN + 6];
4876         char remote[MAX_IPv6_STR_LEN + 6];
4877         const void  *addr;
4878
4879         if (so == NULL) {
4880                 return;
4881         }
4882
4883         inp = sotoinpcb(so);
4884         if (inp == NULL) {
4885                 return;
4886         }
4887
4888         local[0] = remote[0] = 0x0;
4889
4890 #if INET6
4891         if (inp->inp_vflag & INP_IPV6) {
4892                 addr = &inp->in6p_laddr.s6_addr32;
4893                 inet_ntop(AF_INET6, addr, local, sizeof(local));
4894                 addr = &inp->in6p_faddr.s6_addr32;
4895                 inet_ntop(AF_INET6, addr, remote, sizeof(local));
4896         } else
4897 #endif /* INET6 */
4898         {
4899                 addr = &inp->inp_laddr.s_addr;
4900                 inet_ntop(AF_INET, addr, local, sizeof(local));
4901                 addr = &inp->inp_faddr.s_addr;
4902                 inet_ntop(AF_INET, addr, remote, sizeof(local));
4903         }
4904
4905         if (so->so_cfil != NULL) {
4906                 CFIL_LOG(level, "<%s>: <%s so %llx - flags 0x%x 0x%x, sockID %llu> lport %d fport %d laddr %s faddr %s",
4907                     msg, IS_UDP(so) ? "UDP" : "TCP",
4908                     (uint64_t)VM_KERNEL_ADDRPERM(so), inp->inp_flags, inp->inp_socket->so_flags, so->so_cfil->cfi_sock_id,
4909                     ntohs(inp->inp_lport), ntohs(inp->inp_fport), local, remote);
4910         } else {
4911                 CFIL_LOG(level, "<%s>: <%s so %llx - flags 0x%x 0x%x> lport %d fport %d laddr %s faddr %s",
4912                     msg, IS_UDP(so) ? "UDP" : "TCP",
4913                     (uint64_t)VM_KERNEL_ADDRPERM(so), inp->inp_flags, inp->inp_socket->so_flags,
4914                     ntohs(inp->inp_lport), ntohs(inp->inp_fport), local, remote);
4915         }
4916 }
4917
4918 static void
4919 cfil_info_log(int level, struct cfil_info *cfil_info, const char* msg)
4920 {
4921         if (cfil_info == NULL) {
4922                 return;
4923         }
4924
4925         if (cfil_info->cfi_hash_entry != NULL) {
4926                 cfil_hash_entry_log(level, cfil_info->cfi_so, cfil_info->cfi_hash_entry, cfil_info->cfi_sock_id, msg);
4927         } else {
4928                 cfil_inp_log(level, cfil_info->cfi_so, msg);
4929         }
4930 }
4931
4932 errno_t
4933 cfil_db_init(struct socket *so)
4934 {
4935         errno_t error = 0;
4936         struct cfil_db *db = NULL;
4937
4938         CFIL_LOG(LOG_INFO, "");
4939
4940         db = zalloc(cfil_db_zone);
4941         if (db == NULL) {
4942                 error = ENOMEM;
4943                 goto done;
4944         }
4945         bzero(db, sizeof(struct cfil_db));
4946         db->cfdb_so = so;
4947         db->cfdb_hashbase = hashinit(CFILHASHSIZE, M_CFIL, &db->cfdb_hashmask);
4948         if (db->cfdb_hashbase == NULL) {
4949                 zfree(cfil_db_zone, db);
4950                 db = NULL;
4951                 error = ENOMEM;
4952                 goto done;
4953         }
4954
4955         so->so_cfil_db = db;
4956
4957 done:
4958         return error;
4959 }
4960
4961 void
4962 cfil_db_free(struct socket *so)
4963 {
4964         struct cfil_hash_entry *entry = NULL;
4965         struct cfil_hash_entry *temp_entry = NULL;
4966         struct cfilhashhead *cfilhash = NULL;
4967         struct cfil_db *db = NULL;
4968
4969         CFIL_LOG(LOG_INFO, "");
4970
4971         if (so == NULL || so->so_cfil_db == NULL) {
4972                 return;
4973         }
4974         db = so->so_cfil_db;
4975
4976 #if LIFECYCLE_DEBUG
4977         CFIL_LOG(LOG_ERR, "CFIL: LIFECYCLE: <so %llx, db %p> freeing db (count == %d)",
4978             (uint64_t)VM_KERNEL_ADDRPERM(so), db, db->cfdb_count);
4979 #endif
4980
4981         for (int i = 0; i < CFILHASHSIZE; i++) {
4982                 cfilhash = &db->cfdb_hashbase[i];
4983                 LIST_FOREACH_SAFE(entry, cfilhash, cfentry_link, temp_entry) {
4984                         if (entry->cfentry_cfil != NULL) {
4985 #if LIFECYCLE_DEBUG
4986                                 cfil_info_log(LOG_ERR, entry->cfentry_cfil, "CFIL: LIFECYCLE: DB FREE CLEAN UP");
4987 #endif
4988                                 cfil_info_free(entry->cfentry_cfil);
4989                                 OSIncrementAtomic(&cfil_stats.cfs_sock_detached);
4990                                 entry->cfentry_cfil = NULL;
4991                         }
4992
4993                         cfil_db_delete_entry(db, entry);
4994                         if (so->so_flags & SOF_CONTENT_FILTER) {
4995                                 if (db->cfdb_count == 0) {
4996                                         so->so_flags &= ~SOF_CONTENT_FILTER;
4997                                 }
4998                                 VERIFY(so->so_usecount > 0);
4999                                 so->so_usecount--;
5000                         }
5001                 }
5002         }
5003
5004         // Make sure all entries are cleaned up!
5005         VERIFY(db->cfdb_count == 0);
5006 #if LIFECYCLE_DEBUG
5007         CFIL_LOG(LOG_ERR, "CFIL: LIFECYCLE: so usecount %d", so->so_usecount);
5008 #endif
5009
5010         FREE(db->cfdb_hashbase, M_CFIL);
5011         zfree(cfil_db_zone, db);
5012         so->so_cfil_db = NULL;
5013 }
5014
5015 static bool
5016 fill_cfil_hash_entry_from_address(struct cfil_hash_entry *entry, bool isLocal, struct sockaddr *addr)
5017 {
5018         struct sockaddr_in *sin = NULL;
5019         struct sockaddr_in6 *sin6 = NULL;
5020
5021         if (entry == NULL || addr == NULL) {
5022                 return FALSE;
5023         }
5024
5025         switch (addr->sa_family) {
5026         case AF_INET:
5027                 sin = satosin(addr);
5028                 if (sin->sin_len != sizeof(*sin)) {
5029                         return FALSE;
5030                 }
5031                 if (isLocal == TRUE) {
5032                         entry->cfentry_lport = sin->sin_port;
5033                         entry->cfentry_laddr.addr46.ia46_addr4.s_addr = sin->sin_addr.s_addr;
5034                 } else {
5035                         entry->cfentry_fport = sin->sin_port;
5036                         entry->cfentry_faddr.addr46.ia46_addr4.s_addr = sin->sin_addr.s_addr;
5037                 }
5038                 entry->cfentry_family = AF_INET;
5039                 return TRUE;
5040         case AF_INET6:
5041                 sin6 = satosin6(addr);
5042                 if (sin6->sin6_len != sizeof(*sin6)) {
5043                         return FALSE;
5044                 }
5045                 if (isLocal == TRUE) {
5046                         entry->cfentry_lport = sin6->sin6_port;
5047                         entry->cfentry_laddr.addr6 = sin6->sin6_addr;
5048                 } else {
5049                         entry->cfentry_fport = sin6->sin6_port;
5050                         entry->cfentry_faddr.addr6 = sin6->sin6_addr;
5051                 }
5052                 entry->cfentry_family = AF_INET6;
5053                 return TRUE;
5054         default:
5055                 return FALSE;
5056         }
5057 }
5058
5059 static bool
5060 fill_cfil_hash_entry_from_inp(struct cfil_hash_entry *entry, bool isLocal, struct inpcb *inp)
5061 {
5062         if (entry == NULL || inp == NULL) {
5063                 return FALSE;
5064         }
5065
5066         if (inp->inp_vflag & INP_IPV4) {
5067                 if (isLocal == TRUE) {
5068                         entry->cfentry_lport = inp->inp_lport;
5069                         entry->cfentry_laddr.addr46.ia46_addr4.s_addr = inp->inp_laddr.s_addr;
5070                 } else {
5071                         entry->cfentry_fport = inp->inp_fport;
5072                         entry->cfentry_faddr.addr46.ia46_addr4.s_addr = inp->inp_faddr.s_addr;
5073                 }
5074                 entry->cfentry_family = AF_INET;
5075                 return TRUE;
5076         } else if (inp->inp_vflag & INP_IPV6) {
5077                 if (isLocal == TRUE) {
5078                         entry->cfentry_lport = inp->inp_lport;
5079                         entry->cfentry_laddr.addr6 = inp->in6p_laddr;
5080                 } else {
5081                         entry->cfentry_fport = inp->inp_fport;
5082                         entry->cfentry_faddr.addr6 = inp->in6p_faddr;
5083                 }
5084                 entry->cfentry_family = AF_INET6;
5085                 return TRUE;
5086         }
5087         return FALSE;
5088 }
5089
5090 bool
5091 check_port(struct sockaddr *addr, u_short port)
5092 {
5093         struct sockaddr_in *sin = NULL;
5094         struct sockaddr_in6 *sin6 = NULL;
5095
5096         if (addr == NULL || port == 0) {
5097                 return FALSE;
5098         }
5099
5100         switch (addr->sa_family) {
5101         case AF_INET:
5102                 sin = satosin(addr);
5103                 if (sin->sin_len != sizeof(*sin)) {
5104                         return FALSE;
5105                 }
5106                 if (port == ntohs(sin->sin_port)) {
5107                         return TRUE;
5108                 }
5109                 break;
5110         case AF_INET6:
5111                 sin6 = satosin6(addr);
5112                 if (sin6->sin6_len != sizeof(*sin6)) {
5113                         return FALSE;
5114                 }
5115                 if (port == ntohs(sin6->sin6_port)) {
5116                         return TRUE;
5117                 }
5118                 break;
5119         default:
5120                 break;
5121         }
5122         return FALSE;
5123 }
5124
5125 struct cfil_hash_entry *
5126 cfil_db_lookup_entry_with_sockid(struct cfil_db *db, u_int64_t sock_id)
5127 {
5128         struct cfilhashhead *cfilhash = NULL;
5129         u_int32_t flowhash = (u_int32_t)(sock_id & 0x0ffffffff);
5130         struct cfil_hash_entry *nextentry;
5131
5132         if (db == NULL || db->cfdb_hashbase == NULL || sock_id == 0) {
5133                 return NULL;
5134         }
5135
5136         flowhash &= db->cfdb_hashmask;
5137         cfilhash = &db->cfdb_hashbase[flowhash];
5138
5139         LIST_FOREACH(nextentry, cfilhash, cfentry_link) {
5140                 if (nextentry->cfentry_cfil != NULL &&
5141                     nextentry->cfentry_cfil->cfi_sock_id == sock_id) {
5142                         CFIL_LOG(LOG_DEBUG, "CFIL: UDP <so %llx> matched <id %llu, hash %u>",
5143                             (uint64_t)VM_KERNEL_ADDRPERM(db->cfdb_so), nextentry->cfentry_cfil->cfi_sock_id, flowhash);
5144                         cfil_hash_entry_log(LOG_DEBUG, db->cfdb_so, nextentry, 0, "CFIL: UDP found entry");
5145                         return nextentry;
5146                 }
5147         }
5148
5149         CFIL_LOG(LOG_DEBUG, "CFIL: UDP <so %llx> NOT matched <id %llu, hash %u>",
5150             (uint64_t)VM_KERNEL_ADDRPERM(db->cfdb_so), sock_id, flowhash);
5151         return NULL;
5152 }
5153
5154 struct cfil_hash_entry *
5155 cfil_db_lookup_entry(struct cfil_db *db, struct sockaddr *local, struct sockaddr *remote)
5156 {
5157         struct cfil_hash_entry matchentry;
5158         struct cfil_hash_entry *nextentry = NULL;
5159         struct inpcb *inp = sotoinpcb(db->cfdb_so);
5160         u_int32_t hashkey_faddr = 0, hashkey_laddr = 0;
5161         int inp_hash_element = 0;
5162         struct cfilhashhead *cfilhash = NULL;
5163
5164         CFIL_LOG(LOG_INFO, "");
5165
5166         if (inp == NULL) {
5167                 goto done;
5168         }
5169
5170         if (local != NULL) {
5171                 fill_cfil_hash_entry_from_address(&matchentry, TRUE, local);
5172         } else {
5173                 fill_cfil_hash_entry_from_inp(&matchentry, TRUE, inp);
5174         }
5175         if (remote != NULL) {
5176                 fill_cfil_hash_entry_from_address(&matchentry, FALSE, remote);
5177         } else {
5178                 fill_cfil_hash_entry_from_inp(&matchentry, FALSE, inp);
5179         }
5180
5181 #if INET6
5182         if (inp->inp_vflag & INP_IPV6) {
5183                 hashkey_faddr = matchentry.cfentry_faddr.addr6.s6_addr32[3];
5184                 hashkey_laddr = matchentry.cfentry_laddr.addr6.s6_addr32[3];
5185         } else
5186 #endif /* INET6 */
5187         {
5188                 hashkey_faddr = matchentry.cfentry_faddr.addr46.ia46_addr4.s_addr;
5189                 hashkey_laddr = matchentry.cfentry_laddr.addr46.ia46_addr4.s_addr;
5190         }
5191
5192         inp_hash_element = CFIL_HASH(hashkey_laddr, hashkey_faddr,
5193             matchentry.cfentry_lport, matchentry.cfentry_fport);
5194         inp_hash_element &= db->cfdb_hashmask;
5195
5196         cfilhash = &db->cfdb_hashbase[inp_hash_element];
5197
5198         LIST_FOREACH(nextentry, cfilhash, cfentry_link) {
5199 #if INET6
5200                 if ((inp->inp_vflag & INP_IPV6) &&
5201                     nextentry->cfentry_lport == matchentry.cfentry_lport &&
5202                     nextentry->cfentry_fport == matchentry.cfentry_fport &&
5203                     IN6_ARE_ADDR_EQUAL(&nextentry->cfentry_laddr.addr6, &matchentry.cfentry_laddr.addr6) &&
5204                     IN6_ARE_ADDR_EQUAL(&nextentry->cfentry_faddr.addr6, &matchentry.cfentry_faddr.addr6)) {
5205 #if DATA_DEBUG
5206                         cfil_hash_entry_log(LOG_DEBUG, db->cfdb_so, &matchentry, 0, "CFIL LOOKUP ENTRY: UDP V6 found entry");
5207 #endif
5208                         return nextentry;
5209                 } else
5210 #endif /* INET6 */
5211                 if (nextentry->cfentry_lport == matchentry.cfentry_lport &&
5212                     nextentry->cfentry_fport == matchentry.cfentry_fport &&
5213                     nextentry->cfentry_laddr.addr46.ia46_addr4.s_addr == matchentry.cfentry_laddr.addr46.ia46_addr4.s_addr &&
5214                     nextentry->cfentry_faddr.addr46.ia46_addr4.s_addr == matchentry.cfentry_faddr.addr46.ia46_addr4.s_addr) {
5215 #if DATA_DEBUG
5216                         cfil_hash_entry_log(LOG_DEBUG, db->cfdb_so, &matchentry, 0, "CFIL LOOKUP ENTRY: UDP V4 found entry");
5217 #endif
5218                         return nextentry;
5219                 }
5220         }
5221
5222 done:
5223 #if DATA_DEBUG
5224         cfil_hash_entry_log(LOG_DEBUG, db->cfdb_so, &matchentry, 0, "CFIL LOOKUP ENTRY: UDP no entry found");
5225 #endif
5226         return NULL;
5227 }
5228
5229 void
5230 cfil_db_delete_entry(struct cfil_db *db, struct cfil_hash_entry *hash_entry)
5231 {
5232         if (hash_entry == NULL) {
5233                 return;
5234         }
5235         if (db == NULL || db->cfdb_count == 0) {
5236                 return;
5237         }
5238         db->cfdb_count--;
5239         if (db->cfdb_only_entry == hash_entry) {
5240                 db->cfdb_only_entry = NULL;
5241         }
5242         LIST_REMOVE(hash_entry, cfentry_link);
5243         zfree(cfil_hash_entry_zone, hash_entry);
5244 }
5245
5246 struct cfil_hash_entry *
5247 cfil_db_add_entry(struct cfil_db *db, struct sockaddr *local, struct sockaddr *remote)
5248 {
5249         struct cfil_hash_entry *entry = NULL;
5250         struct inpcb *inp = sotoinpcb(db->cfdb_so);
5251         u_int32_t hashkey_faddr = 0, hashkey_laddr = 0;
5252         int inp_hash_element = 0;
5253         struct cfilhashhead *cfilhash = NULL;
5254
5255         CFIL_LOG(LOG_INFO, "");
5256
5257         if (inp == NULL) {
5258                 goto done;
5259         }
5260
5261         entry = zalloc(cfil_hash_entry_zone);
5262         if (entry == NULL) {
5263                 goto done;
5264         }
5265         bzero(entry, sizeof(struct cfil_hash_entry));
5266
5267         if (local != NULL) {
5268                 fill_cfil_hash_entry_from_address(entry, TRUE, local);
5269         } else {
5270                 fill_cfil_hash_entry_from_inp(entry, TRUE, inp);
5271         }
5272         if (remote != NULL) {
5273                 fill_cfil_hash_entry_from_address(entry, FALSE, remote);
5274         } else {
5275                 fill_cfil_hash_entry_from_inp(entry, FALSE, inp);
5276         }
5277         entry->cfentry_lastused = net_uptime();
5278
5279 #if INET6
5280         if (inp->inp_vflag & INP_IPV6) {
5281                 hashkey_faddr = entry->cfentry_faddr.addr6.s6_addr32[3];
5282                 hashkey_laddr = entry->cfentry_laddr.addr6.s6_addr32[3];
5283         } else
5284 #endif /* INET6 */
5285         {
5286                 hashkey_faddr = entry->cfentry_faddr.addr46.ia46_addr4.s_addr;
5287                 hashkey_laddr = entry->cfentry_laddr.addr46.ia46_addr4.s_addr;
5288         }
5289         entry->cfentry_flowhash = CFIL_HASH(hashkey_laddr, hashkey_faddr,
5290             entry->cfentry_lport, entry->cfentry_fport);
5291         inp_hash_element = entry->cfentry_flowhash & db->cfdb_hashmask;
5292
5293         cfilhash = &db->cfdb_hashbase[inp_hash_element];
5294
5295         LIST_INSERT_HEAD(cfilhash, entry, cfentry_link);
5296         db->cfdb_count++;
5297         db->cfdb_only_entry = entry;
5298         cfil_hash_entry_log(LOG_DEBUG, db->cfdb_so, entry, 0, "CFIL: cfil_db_add_entry: ADDED");
5299
5300 done:
5301         CFIL_LOG(LOG_DEBUG, "CFIL: UDP <so %llx> total count %d", (uint64_t)VM_KERNEL_ADDRPERM(db->cfdb_so), db->cfdb_count);
5302         return entry;
5303 }
5304
5305 struct cfil_info *
5306 cfil_db_get_cfil_info(struct cfil_db *db, cfil_sock_id_t id)
5307 {
5308         struct cfil_hash_entry *hash_entry = NULL;
5309
5310         CFIL_LOG(LOG_INFO, "");
5311
5312         if (db == NULL || id == 0) {
5313                 CFIL_LOG(LOG_DEBUG, "CFIL: UDP <so %llx> NULL DB <id %llu>",
5314                     (uint64_t)VM_KERNEL_ADDRPERM(db->cfdb_so), id);
5315                 return NULL;
5316         }
5317
5318         // This is an optimization for connected UDP socket which only has one flow.
5319         // No need to do the hash lookup.
5320         if (db->cfdb_count == 1) {
5321                 if (db->cfdb_only_entry && db->cfdb_only_entry->cfentry_cfil &&
5322                     db->cfdb_only_entry->cfentry_cfil->cfi_sock_id == id) {
5323                         return db->cfdb_only_entry->cfentry_cfil;
5324                 }
5325         }
5326
5327         hash_entry = cfil_db_lookup_entry_with_sockid(db, id);
5328         return hash_entry != NULL ? hash_entry->cfentry_cfil : NULL;
5329 }
5330
5331 struct cfil_hash_entry *
5332 cfil_sock_udp_get_flow(struct socket *so, uint32_t filter_control_unit, bool outgoing, struct sockaddr *local, struct sockaddr *remote)
5333 {
5334 #pragma unused(so, filter_control_unit, outgoing, local, remote)
5335         struct cfil_hash_entry *hash_entry = NULL;
5336
5337         errno_t error = 0;
5338         socket_lock_assert_owned(so);
5339
5340         // If new socket, allocate cfil db
5341         if (so->so_cfil_db == NULL) {
5342                 if (cfil_db_init(so) != 0) {
5343                         return NULL;
5344                 }
5345         }
5346
5347         // See if flow already exists.
5348         hash_entry = cfil_db_lookup_entry(so->so_cfil_db, local, remote);
5349         if (hash_entry != NULL) {
5350                 return hash_entry;
5351         }
5352
5353         hash_entry = cfil_db_add_entry(so->so_cfil_db, local, remote);
5354         if (hash_entry == NULL) {
5355                 OSIncrementAtomic(&cfil_stats.cfs_sock_attach_no_mem);
5356                 CFIL_LOG(LOG_ERR, "CFIL: UDP failed to add entry");
5357                 return NULL;
5358         }
5359
5360         if (cfil_info_alloc(so, hash_entry) == NULL ||
5361             hash_entry->cfentry_cfil == NULL) {
5362                 cfil_db_delete_entry(so->so_cfil_db, hash_entry);
5363                 CFIL_LOG(LOG_ERR, "CFIL: UDP failed to alloc cfil_info");
5364                 OSIncrementAtomic(&cfil_stats.cfs_sock_attach_no_mem);
5365                 return NULL;
5366         }
5367
5368 #if LIFECYCLE_DEBUG
5369         cfil_info_log(LOG_ERR, hash_entry->cfentry_cfil, "CFIL: LIFECYCLE: ADDED");
5370 #endif
5371
5372         if (cfil_info_attach_unit(so, filter_control_unit, hash_entry->cfentry_cfil) == 0) {
5373                 cfil_info_free(hash_entry->cfentry_cfil);
5374                 cfil_db_delete_entry(so->so_cfil_db, hash_entry);
5375                 CFIL_LOG(LOG_ERR, "CFIL: UDP cfil_info_attach_unit(%u) failed",
5376                     filter_control_unit);
5377                 OSIncrementAtomic(&cfil_stats.cfs_sock_attach_failed);
5378                 return NULL;
5379         }
5380         CFIL_LOG(LOG_DEBUG, "CFIL: UDP <so %llx> filter_control_unit %u sockID %llu attached",
5381             (uint64_t)VM_KERNEL_ADDRPERM(so),
5382             filter_control_unit, hash_entry->cfentry_cfil->cfi_sock_id);
5383
5384         so->so_flags |= SOF_CONTENT_FILTER;
5385         OSIncrementAtomic(&cfil_stats.cfs_sock_attached);
5386
5387         /* Hold a reference on the socket for each flow */
5388         so->so_usecount++;
5389
5390         error = cfil_dispatch_attach_event(so, hash_entry->cfentry_cfil, filter_control_unit);
5391         /* We can recover from flow control or out of memory errors */
5392         if (error != 0 && error != ENOBUFS && error != ENOMEM) {
5393                 return NULL;
5394         }
5395
5396         CFIL_INFO_VERIFY(hash_entry->cfentry_cfil);
5397         return hash_entry;
5398 }
5399
5400 errno_t
5401 cfil_sock_udp_handle_data(bool outgoing, struct socket *so,
5402     struct sockaddr *local, struct sockaddr *remote,
5403     struct mbuf *data, struct mbuf *control, uint32_t flags)
5404 {
5405 #pragma unused(outgoing, so, local, remote, data, control, flags)
5406         errno_t error = 0;
5407         uint32_t filter_control_unit;
5408         struct cfil_hash_entry *hash_entry = NULL;
5409         struct cfil_info *cfil_info = NULL;
5410
5411         socket_lock_assert_owned(so);
5412
5413         if (cfil_active_count == 0) {
5414                 CFIL_LOG(LOG_DEBUG, "CFIL: UDP no active filter");
5415                 OSIncrementAtomic(&cfil_stats.cfs_sock_attach_in_vain);
5416                 return error;
5417         }
5418
5419         filter_control_unit = necp_socket_get_content_filter_control_unit(so);
5420         if (filter_control_unit == 0) {
5421                 CFIL_LOG(LOG_DEBUG, "CFIL: UDP failed to get control unit");
5422                 return error;
5423         }
5424
5425         if ((filter_control_unit & NECP_MASK_USERSPACE_ONLY) != 0) {
5426                 CFIL_LOG(LOG_DEBUG, "CFIL: UDP user space only");
5427                 OSIncrementAtomic(&cfil_stats.cfs_sock_userspace_only);
5428                 return error;
5429         }
5430
5431         hash_entry = cfil_sock_udp_get_flow(so, filter_control_unit, outgoing, local, remote);
5432         if (hash_entry == NULL || hash_entry->cfentry_cfil == NULL) {
5433                 CFIL_LOG(LOG_ERR, "CFIL: Falied to create UDP flow");
5434                 return EPIPE;
5435         }
5436         // Update last used timestamp, this is for flow Idle TO
5437         hash_entry->cfentry_lastused = net_uptime();
5438         cfil_info = hash_entry->cfentry_cfil;
5439
5440         if (cfil_info->cfi_flags & CFIF_DROP) {
5441 #if DATA_DEBUG
5442                 cfil_hash_entry_log(LOG_DEBUG, so, hash_entry, 0, "CFIL: UDP DROP");
5443 #endif
5444                 return EPIPE;
5445         }
5446         if (control != NULL) {
5447                 OSIncrementAtomic(&cfil_stats.cfs_data_in_control);
5448         }
5449         if (data->m_type == MT_OOBDATA) {
5450                 CFIL_LOG(LOG_ERR, "so %llx MSG_OOB",
5451                     (uint64_t)VM_KERNEL_ADDRPERM(so));
5452                 OSIncrementAtomic(&cfil_stats.cfs_data_in_oob);
5453         }
5454
5455         error = cfil_data_common(so, cfil_info, outgoing, remote, data, control, flags);
5456
5457         return error;
5458 }
5459
5460 /*
5461  * Go through all UDP flows for specified socket and returns TRUE if
5462  * any flow is still attached.  If need_wait is TRUE, wait on first
5463  * attached flow.
5464  */
5465 static int
5466 cfil_filters_udp_attached(struct socket *so, bool need_wait)
5467 {
5468         struct timespec ts;
5469         lck_mtx_t *mutex_held;
5470         struct cfilhashhead *cfilhash = NULL;
5471         struct cfil_db *db = NULL;
5472         struct cfil_hash_entry *hash_entry = NULL;
5473         struct cfil_hash_entry *temp_hash_entry = NULL;
5474         struct cfil_info *cfil_info = NULL;
5475         struct cfil_entry *entry = NULL;
5476         errno_t error = 0;
5477         int kcunit;
5478         int attached = 0;
5479         uint64_t sock_flow_id = 0;
5480
5481         socket_lock_assert_owned(so);
5482
5483         if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_cfil_db != NULL) {
5484                 if (so->so_proto->pr_getlock != NULL) {
5485                         mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
5486                 } else {
5487                         mutex_held = so->so_proto->pr_domain->dom_mtx;
5488                 }
5489                 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
5490
5491                 db = so->so_cfil_db;
5492
5493                 for (int i = 0; i < CFILHASHSIZE; i++) {
5494                         cfilhash = &db->cfdb_hashbase[i];
5495
5496                         LIST_FOREACH_SAFE(hash_entry, cfilhash, cfentry_link, temp_hash_entry) {
5497                                 if (hash_entry->cfentry_cfil != NULL) {
5498                                         cfil_info = hash_entry->cfentry_cfil;
5499                                         for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
5500                                                 entry = &cfil_info->cfi_entries[kcunit - 1];
5501
5502                                                 /* Are we attached to the filter? */
5503                                                 if (entry->cfe_filter == NULL) {
5504                                                         continue;
5505                                                 }
5506
5507                                                 if ((entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED) == 0) {
5508                                                         continue;
5509                                                 }
5510                                                 if ((entry->cfe_flags & CFEF_CFIL_DETACHED) != 0) {
5511                                                         continue;
5512                                                 }
5513
5514                                                 attached = 1;
5515
5516                                                 if (need_wait == TRUE) {
5517 #if LIFECYCLE_DEBUG
5518                                                         cfil_info_log(LOG_ERR, cfil_info, "CFIL: LIFECYCLE: WAIT FOR FLOW TO FINISH");
5519 #endif
5520
5521                                                         ts.tv_sec = cfil_close_wait_timeout / 1000;
5522                                                         ts.tv_nsec = (cfil_close_wait_timeout % 1000) *
5523                                                             NSEC_PER_USEC * 1000;
5524
5525                                                         OSIncrementAtomic(&cfil_stats.cfs_close_wait);
5526                                                         cfil_info->cfi_flags |= CFIF_CLOSE_WAIT;
5527                                                         sock_flow_id = cfil_info->cfi_sock_id;
5528
5529                                                         error = msleep((caddr_t)cfil_info, mutex_held,
5530                                                             PSOCK | PCATCH, "cfil_filters_udp_attached", &ts);
5531
5532                                                         // Woke up from sleep, validate if cfil_info is still valid
5533                                                         if (so->so_cfil_db == NULL ||
5534                                                             (cfil_info != cfil_db_get_cfil_info(so->so_cfil_db, sock_flow_id))) {
5535                                                                 // cfil_info is not valid, do not continue
5536                                                                 goto done;
5537                                                         }
5538
5539                                                         cfil_info->cfi_flags &= ~CFIF_CLOSE_WAIT;
5540
5541 #if LIFECYCLE_DEBUG
5542                                                         cfil_info_log(LOG_ERR, cfil_info, "CFIL: LIFECYCLE: WAIT FOR FLOW DONE");
5543 #endif
5544
5545                                                         /*
5546                                                          * Force close in case of timeout
5547                                                          */
5548                                                         if (error != 0) {
5549                                                                 OSIncrementAtomic(&cfil_stats.cfs_close_wait_timeout);
5550 #if LIFECYCLE_DEBUG
5551                                                                 cfil_info_log(LOG_ERR, cfil_info, "CFIL: LIFECYCLE: WAIT FOR FLOW TIMED OUT, FORCE DETACH");
5552 #endif
5553                                                                 entry->cfe_flags |= CFEF_CFIL_DETACHED;
5554                                                         }
5555                                                 }
5556                                                 goto done;
5557                                         }
5558                                 }
5559                         }
5560                 }
5561         }
5562
5563 done:
5564         return attached;
5565 }
5566
5567 int32_t
5568 cfil_sock_udp_data_pending(struct sockbuf *sb, bool check_thread)
5569 {
5570         struct socket *so = sb->sb_so;
5571         struct cfi_buf *cfi_buf;
5572         uint64_t pending = 0;
5573         uint64_t total_pending = 0;
5574         struct cfilhashhead *cfilhash = NULL;
5575         struct cfil_db *db = NULL;
5576         struct cfil_hash_entry *hash_entry = NULL;
5577         struct cfil_hash_entry *temp_hash_entry = NULL;
5578
5579         socket_lock_assert_owned(so);
5580
5581         if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_cfil_db != NULL &&
5582             (check_thread == FALSE || so->so_snd.sb_cfil_thread != current_thread())) {
5583                 db = so->so_cfil_db;
5584
5585                 for (int i = 0; i < CFILHASHSIZE; i++) {
5586                         cfilhash = &db->cfdb_hashbase[i];
5587
5588                         LIST_FOREACH_SAFE(hash_entry, cfilhash, cfentry_link, temp_hash_entry) {
5589                                 if (hash_entry->cfentry_cfil != NULL) {
5590                                         if ((sb->sb_flags & SB_RECV) == 0) {
5591                                                 cfi_buf = &hash_entry->cfentry_cfil->cfi_snd;
5592                                         } else {
5593                                                 cfi_buf = &hash_entry->cfentry_cfil->cfi_rcv;
5594                                         }
5595
5596                                         pending = cfi_buf->cfi_pending_last - cfi_buf->cfi_pending_first;
5597                                         /*
5598                                          * If we are limited by the "chars of mbufs used" roughly
5599                                          * adjust so we won't overcommit
5600                                          */
5601                                         if ((uint64_t)cfi_buf->cfi_pending_mbcnt > pending) {
5602                                                 pending = cfi_buf->cfi_pending_mbcnt;
5603                                         }
5604
5605                                         total_pending += pending;
5606                                 }
5607                         }
5608                 }
5609
5610                 VERIFY(total_pending < INT32_MAX);
5611 #if DATA_DEBUG
5612                 CFIL_LOG(LOG_DEBUG, "CFIL: <so %llx> total pending %llu <check_thread %d>",
5613                     (uint64_t)VM_KERNEL_ADDRPERM(so),
5614                     total_pending, check_thread);
5615 #endif
5616         }
5617
5618         return (int32_t)(total_pending);
5619 }
5620
5621 int
5622 cfil_sock_udp_notify_shutdown(struct socket *so, int how, int drop_flag, int shut_flag)
5623 {
5624         struct cfil_info *cfil_info = NULL;
5625         struct cfilhashhead *cfilhash = NULL;
5626         struct cfil_db *db = NULL;
5627         struct cfil_hash_entry *hash_entry = NULL;
5628         struct cfil_hash_entry *temp_hash_entry = NULL;
5629         errno_t error = 0;
5630         int done_count = 0;
5631         int kcunit;
5632
5633         socket_lock_assert_owned(so);
5634
5635         if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_cfil_db != NULL) {
5636                 db = so->so_cfil_db;
5637
5638                 for (int i = 0; i < CFILHASHSIZE; i++) {
5639                         cfilhash = &db->cfdb_hashbase[i];
5640
5641                         LIST_FOREACH_SAFE(hash_entry, cfilhash, cfentry_link, temp_hash_entry) {
5642                                 if (hash_entry->cfentry_cfil != NULL) {
5643                                         cfil_info = hash_entry->cfentry_cfil;
5644
5645                                         // This flow is marked as DROP
5646                                         if (cfil_info->cfi_flags & drop_flag) {
5647                                                 done_count++;
5648                                                 continue;
5649                                         }
5650
5651                                         // This flow has been shut already, skip
5652                                         if (cfil_info->cfi_flags & shut_flag) {
5653                                                 continue;
5654                                         }
5655                                         // Mark flow as shut
5656                                         cfil_info->cfi_flags |= shut_flag;
5657                                         done_count++;
5658
5659                                         for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
5660                                                 /* Disconnect incoming side */
5661                                                 if (how != SHUT_WR) {
5662                                                         error = cfil_dispatch_disconnect_event(so, cfil_info, kcunit, 0);
5663                                                 }
5664                                                 /* Disconnect outgoing side */
5665                                                 if (how != SHUT_RD) {
5666                                                         error = cfil_dispatch_disconnect_event(so, cfil_info, kcunit, 1);
5667                                                 }
5668                                         }
5669                                 }
5670                         }
5671                 }
5672         }
5673
5674         if (done_count == 0) {
5675                 error = ENOTCONN;
5676         }
5677         return error;
5678 }
5679
5680 int
5681 cfil_sock_udp_shutdown(struct socket *so, int *how)
5682 {
5683         int error = 0;
5684
5685         if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || (so->so_cfil_db == NULL)) {
5686                 goto done;
5687         }
5688
5689         socket_lock_assert_owned(so);
5690
5691         CFIL_LOG(LOG_INFO, "so %llx how %d",
5692             (uint64_t)VM_KERNEL_ADDRPERM(so), *how);
5693
5694         /*
5695          * Check the state of the socket before the content filter
5696          */
5697         if (*how != SHUT_WR && (so->so_state & SS_CANTRCVMORE) != 0) {
5698                 /* read already shut down */
5699                 error = ENOTCONN;
5700                 goto done;
5701         }
5702         if (*how != SHUT_RD && (so->so_state & SS_CANTSENDMORE) != 0) {
5703                 /* write already shut down */
5704                 error = ENOTCONN;
5705                 goto done;
5706         }
5707
5708         /*
5709          * shutdown read: SHUT_RD or SHUT_RDWR
5710          */
5711         if (*how != SHUT_WR) {
5712                 error = cfil_sock_udp_notify_shutdown(so, SHUT_RD, CFIF_DROP, CFIF_SHUT_RD);
5713                 if (error != 0) {
5714                         goto done;
5715                 }
5716         }
5717         /*
5718          * shutdown write: SHUT_WR or SHUT_RDWR
5719          */
5720         if (*how != SHUT_RD) {
5721                 error = cfil_sock_udp_notify_shutdown(so, SHUT_WR, CFIF_DROP, CFIF_SHUT_WR);
5722                 if (error != 0) {
5723                         goto done;
5724                 }
5725
5726                 /*
5727                  * When outgoing data is pending, we delay the shutdown at the
5728                  * protocol level until the content filters give the final
5729                  * verdict on the pending data.
5730                  */
5731                 if (cfil_sock_data_pending(&so->so_snd) != 0) {
5732                         /*
5733                          * When shutting down the read and write sides at once
5734                          * we can proceed to the final shutdown of the read
5735                          * side. Otherwise, we just return.
5736                          */
5737                         if (*how == SHUT_WR) {
5738                                 error = EJUSTRETURN;
5739                         } else if (*how == SHUT_RDWR) {
5740                                 *how = SHUT_RD;
5741                         }
5742                 }
5743         }
5744 done:
5745         return error;
5746 }
5747
5748 void
5749 cfil_sock_udp_close_wait(struct socket *so)
5750 {
5751         socket_lock_assert_owned(so);
5752
5753         while (cfil_filters_udp_attached(so, FALSE)) {
5754                 /*
5755                  * Notify the filters we are going away so they can detach
5756                  */
5757                 cfil_sock_udp_notify_shutdown(so, SHUT_RDWR, 0, 0);
5758
5759                 /*
5760                  * Make sure we need to wait after the filter are notified
5761                  * of the disconnection
5762                  */
5763                 if (cfil_filters_udp_attached(so, TRUE) == 0) {
5764                         break;
5765                 }
5766         }
5767 }
5768
5769 void
5770 cfil_sock_udp_is_closed(struct socket *so)
5771 {
5772         struct cfil_info *cfil_info = NULL;
5773         struct cfilhashhead *cfilhash = NULL;
5774         struct cfil_db *db = NULL;
5775         struct cfil_hash_entry *hash_entry = NULL;
5776         struct cfil_hash_entry *temp_hash_entry = NULL;
5777         errno_t error = 0;
5778         int kcunit;
5779
5780         socket_lock_assert_owned(so);
5781
5782         if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_cfil_db != NULL) {
5783                 db = so->so_cfil_db;
5784
5785                 for (int i = 0; i < CFILHASHSIZE; i++) {
5786                         cfilhash = &db->cfdb_hashbase[i];
5787
5788                         LIST_FOREACH_SAFE(hash_entry, cfilhash, cfentry_link, temp_hash_entry) {
5789                                 if (hash_entry->cfentry_cfil != NULL) {
5790                                         cfil_info = hash_entry->cfentry_cfil;
5791
5792                                         for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
5793                                                 /* Let the filters know of the closing */
5794                                                 error = cfil_dispatch_closed_event(so, cfil_info, kcunit);
5795                                         }
5796
5797                                         /* Last chance to push passed data out */
5798                                         error = cfil_acquire_sockbuf(so, cfil_info, 1);
5799                                         if (error == 0) {
5800                                                 cfil_service_inject_queue(so, cfil_info, 1);
5801                                         }
5802                                         cfil_release_sockbuf(so, 1);
5803
5804                                         cfil_info->cfi_flags |= CFIF_SOCK_CLOSED;
5805
5806                                         /* Pending data needs to go */
5807                                         cfil_flush_queues(so, cfil_info);
5808
5809                                         CFIL_INFO_VERIFY(cfil_info);
5810                                 }
5811                         }
5812                 }
5813         }
5814 }
5815
5816 void
5817 cfil_sock_udp_buf_update(struct sockbuf *sb)
5818 {
5819         struct cfil_info *cfil_info = NULL;
5820         struct cfilhashhead *cfilhash = NULL;
5821         struct cfil_db *db = NULL;
5822         struct cfil_hash_entry *hash_entry = NULL;
5823         struct cfil_hash_entry *temp_hash_entry = NULL;
5824         errno_t error = 0;
5825         int outgoing;
5826         struct socket *so = sb->sb_so;
5827
5828         socket_lock_assert_owned(so);
5829
5830         if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_cfil_db != NULL) {
5831                 if (!cfil_sbtrim) {
5832                         return;
5833                 }
5834
5835                 db = so->so_cfil_db;
5836
5837                 for (int i = 0; i < CFILHASHSIZE; i++) {
5838                         cfilhash = &db->cfdb_hashbase[i];
5839
5840                         LIST_FOREACH_SAFE(hash_entry, cfilhash, cfentry_link, temp_hash_entry) {
5841                                 if (hash_entry->cfentry_cfil != NULL) {
5842                                         cfil_info = hash_entry->cfentry_cfil;
5843
5844                                         if ((sb->sb_flags & SB_RECV) == 0) {
5845                                                 if ((cfil_info->cfi_flags & CFIF_RETRY_INJECT_OUT) == 0) {
5846                                                         return;
5847                                                 }
5848                                                 outgoing = 1;
5849                                                 OSIncrementAtomic(&cfil_stats.cfs_inject_q_out_retry);
5850                                         } else {
5851                                                 if ((cfil_info->cfi_flags & CFIF_RETRY_INJECT_IN) == 0) {
5852                                                         return;
5853                                                 }
5854                                                 outgoing = 0;
5855                                                 OSIncrementAtomic(&cfil_stats.cfs_inject_q_in_retry);
5856                                         }
5857
5858                                         CFIL_LOG(LOG_NOTICE, "so %llx outgoing %d",
5859                                             (uint64_t)VM_KERNEL_ADDRPERM(so), outgoing);
5860
5861                                         error = cfil_acquire_sockbuf(so, cfil_info, outgoing);
5862                                         if (error == 0) {
5863                                                 cfil_service_inject_queue(so, cfil_info, outgoing);
5864                                         }
5865                                         cfil_release_sockbuf(so, outgoing);
5866                                 }
5867                         }
5868                 }
5869         }
5870 }
5871
5872 void
5873 cfil_filter_show(u_int32_t kcunit)
5874 {
5875         struct content_filter *cfc = NULL;
5876         struct cfil_entry *entry;
5877         int count = 0;
5878
5879         if (content_filters == NULL) {
5880                 return;
5881         }
5882         if (kcunit > MAX_CONTENT_FILTER) {
5883                 return;
5884         }
5885
5886         cfil_rw_lock_shared(&cfil_lck_rw);
5887
5888         if (content_filters[kcunit - 1] == NULL) {
5889                 cfil_rw_unlock_shared(&cfil_lck_rw);
5890                 return;
5891         }
5892         cfc = content_filters[kcunit - 1];
5893
5894         CFIL_LOG(LOG_ERR, "CFIL: FILTER SHOW: Filter <unit %d, entry count %d> flags <%lx>:",
5895             kcunit, cfc->cf_sock_count, (unsigned long)cfc->cf_flags);
5896         if (cfc->cf_flags & CFF_DETACHING) {
5897                 CFIL_LOG(LOG_ERR, "CFIL: FILTER SHOW: - DETACHING");
5898         }
5899         if (cfc->cf_flags & CFF_ACTIVE) {
5900                 CFIL_LOG(LOG_ERR, "CFIL: FILTER SHOW: - ACTIVE");
5901         }
5902         if (cfc->cf_flags & CFF_FLOW_CONTROLLED) {
5903                 CFIL_LOG(LOG_ERR, "CFIL: FILTER SHOW: - FLOW CONTROLLED");
5904         }
5905
5906         TAILQ_FOREACH(entry, &cfc->cf_sock_entries, cfe_link) {
5907                 if (entry->cfe_cfil_info && entry->cfe_cfil_info->cfi_so) {
5908                         struct cfil_info *cfil_info = entry->cfe_cfil_info;
5909
5910                         count++;
5911
5912                         if (entry->cfe_flags & CFEF_CFIL_DETACHED) {
5913                                 cfil_info_log(LOG_ERR, cfil_info, "CFIL: FILTER SHOW: - DETACHED");
5914                         } else {
5915                                 cfil_info_log(LOG_ERR, cfil_info, "CFIL: FILTER SHOW: - ATTACHED");
5916                         }
5917                 }
5918         }
5919
5920         CFIL_LOG(LOG_ERR, "CFIL: FILTER SHOW: Filter - total entries shown: %d", count);
5921
5922         cfil_rw_unlock_shared(&cfil_lck_rw);
5923 }
5924
5925 void
5926 cfil_info_show(void)
5927 {
5928         struct cfil_info *cfil_info;
5929         int count = 0;
5930
5931         cfil_rw_lock_shared(&cfil_lck_rw);
5932
5933         CFIL_LOG(LOG_ERR, "CFIL: INFO SHOW: count %d", cfil_sock_attached_count);
5934
5935         TAILQ_FOREACH(cfil_info, &cfil_sock_head, cfi_link) {
5936                 count++;
5937
5938                 cfil_info_log(LOG_ERR, cfil_info, "CFIL: INFO SHOW");
5939
5940                 if (cfil_info->cfi_flags & CFIF_DROP) {
5941                         CFIL_LOG(LOG_ERR, "CFIL: INFO FLAG - DROP");
5942                 }
5943                 if (cfil_info->cfi_flags & CFIF_CLOSE_WAIT) {
5944                         CFIL_LOG(LOG_ERR, "CFIL: INFO FLAG - CLOSE_WAIT");
5945                 }
5946                 if (cfil_info->cfi_flags & CFIF_SOCK_CLOSED) {
5947                         CFIL_LOG(LOG_ERR, "CFIL: INFO FLAG - SOCK_CLOSED");
5948                 }
5949                 if (cfil_info->cfi_flags & CFIF_RETRY_INJECT_IN) {
5950                         CFIL_LOG(LOG_ERR, "CFIL: INFO FLAG - RETRY_INJECT_IN");
5951                 }
5952                 if (cfil_info->cfi_flags & CFIF_RETRY_INJECT_OUT) {
5953                         CFIL_LOG(LOG_ERR, "CFIL: INFO FLAG - RETRY_INJECT_OUT");
5954                 }
5955                 if (cfil_info->cfi_flags & CFIF_SHUT_WR) {
5956                         CFIL_LOG(LOG_ERR, "CFIL: INFO FLAG - SHUT_WR");
5957                 }
5958                 if (cfil_info->cfi_flags & CFIF_SHUT_RD) {
5959                         CFIL_LOG(LOG_ERR, "CFIL: INFO FLAG - SHUT_RD");
5960                 }
5961         }
5962
5963         CFIL_LOG(LOG_ERR, "CFIL: INFO SHOW: total cfil_info shown: %d", count);
5964
5965         cfil_rw_unlock_shared(&cfil_lck_rw);
5966 }
5967
5968 bool
5969 cfil_info_idle_timed_out(struct cfil_info *cfil_info, int timeout, u_int32_t current_time)
5970 {
5971         if (cfil_info && cfil_info->cfi_hash_entry &&
5972             (current_time - cfil_info->cfi_hash_entry->cfentry_lastused >= (u_int32_t)timeout)) {
5973 #if GC_DEBUG
5974                 cfil_info_log(LOG_ERR, cfil_info, "CFIL: flow IDLE timeout expired");
5975 #endif
5976                 return true;
5977         }
5978         return false;
5979 }
5980
5981 bool
5982 cfil_info_action_timed_out(struct cfil_info *cfil_info, int timeout)
5983 {
5984         struct cfil_entry *entry;
5985         struct timeval current_tv;
5986         struct timeval diff_time;
5987
5988         if (cfil_info == NULL) {
5989                 return false;
5990         }
5991
5992         /*
5993          * If we have queued up more data than passed offset and we haven't received
5994          * an action from user space for a while (the user space filter might have crashed),
5995          * return action timed out.
5996          */
5997         if (cfil_info->cfi_snd.cfi_pending_last > cfil_info->cfi_snd.cfi_pass_offset ||
5998             cfil_info->cfi_rcv.cfi_pending_last > cfil_info->cfi_rcv.cfi_pass_offset) {
5999                 microuptime(&current_tv);
6000
6001                 for (int kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
6002                         entry = &cfil_info->cfi_entries[kcunit - 1];
6003
6004                         if (entry->cfe_filter == NULL) {
6005                                 continue;
6006                         }
6007
6008                         if (cfil_info->cfi_snd.cfi_pending_last > entry->cfe_snd.cfe_pass_offset ||
6009                             cfil_info->cfi_rcv.cfi_pending_last > entry->cfe_rcv.cfe_pass_offset) {
6010                                 // haven't gotten an action from this filter, check timeout
6011                                 timersub(&current_tv, &entry->cfe_last_action, &diff_time);
6012                                 if (diff_time.tv_sec >= timeout) {
6013 #if GC_DEBUG
6014                                         cfil_info_log(LOG_ERR, cfil_info, "CFIL: flow ACTION timeout expired");
6015 #endif
6016                                         return true;
6017                                 }
6018                         }
6019                 }
6020         }
6021         return false;
6022 }
6023
6024 bool
6025 cfil_info_buffer_threshold_exceeded(struct cfil_info *cfil_info)
6026 {
6027         if (cfil_info == NULL) {
6028                 return false;
6029         }
6030
6031         /*
6032          * Clean up flow if it exceeded queue thresholds
6033          */
6034         if (cfil_info->cfi_snd.cfi_tail_drop_cnt ||
6035             cfil_info->cfi_rcv.cfi_tail_drop_cnt) {
6036 #if GC_DEBUG
6037                 CFIL_LOG(LOG_ERR, "CFIL: queue threshold exceeded: mbuf max <count: %d bytes: %d> tail drop count <OUT: %d IN: %d>",
6038                     cfil_udp_gc_mbuf_num_max,
6039                     cfil_udp_gc_mbuf_cnt_max,
6040                     cfil_info->cfi_snd.cfi_tail_drop_cnt,
6041                     cfil_info->cfi_rcv.cfi_tail_drop_cnt);
6042                 cfil_info_log(LOG_ERR, cfil_info, "CFIL: queue threshold exceeded");
6043 #endif
6044                 return true;
6045         }
6046
6047         return false;
6048 }
6049
6050 static void
6051 cfil_udp_gc_thread_sleep(bool forever)
6052 {
6053         if (forever) {
6054                 (void) assert_wait((event_t) &cfil_sock_udp_attached_count,
6055                     THREAD_INTERRUPTIBLE);
6056         } else {
6057                 uint64_t deadline = 0;
6058                 nanoseconds_to_absolutetime(UDP_FLOW_GC_RUN_INTERVAL_NSEC, &deadline);
6059                 clock_absolutetime_interval_to_deadline(deadline, &deadline);
6060
6061                 (void) assert_wait_deadline(&cfil_sock_udp_attached_count,
6062                     THREAD_INTERRUPTIBLE, deadline);
6063         }
6064 }
6065
6066 static void
6067 cfil_udp_gc_thread_func(void *v, wait_result_t w)
6068 {
6069 #pragma unused(v, w)
6070
6071         ASSERT(cfil_udp_gc_thread == current_thread());
6072         thread_set_thread_name(current_thread(), "CFIL_UPD_GC");
6073
6074         // Kick off gc shortly
6075         cfil_udp_gc_thread_sleep(false);
6076         thread_block_parameter((thread_continue_t) cfil_info_udp_expire, NULL);
6077         /* NOTREACHED */
6078 }
6079
6080 static void
6081 cfil_info_udp_expire(void *v, wait_result_t w)
6082 {
6083 #pragma unused(v, w)
6084
6085         static uint64_t expired_array[UDP_FLOW_GC_MAX_COUNT];
6086         static uint32_t expired_count = 0;
6087
6088         struct cfil_info *cfil_info;
6089         struct cfil_hash_entry *hash_entry;
6090         struct cfil_db *db;
6091         struct socket *so;
6092         u_int32_t current_time = 0;
6093
6094         current_time = net_uptime();
6095
6096         // Get all expired UDP flow ids
6097         cfil_rw_lock_shared(&cfil_lck_rw);
6098
6099         if (cfil_sock_udp_attached_count == 0) {
6100                 cfil_rw_unlock_shared(&cfil_lck_rw);
6101                 goto go_sleep;
6102         }
6103
6104         TAILQ_FOREACH(cfil_info, &cfil_sock_head, cfi_link) {
6105                 if (expired_count >= UDP_FLOW_GC_MAX_COUNT) {
6106                         break;
6107                 }
6108
6109                 if (IS_UDP(cfil_info->cfi_so)) {
6110                         if (cfil_info_idle_timed_out(cfil_info, UDP_FLOW_GC_IDLE_TO, current_time) ||
6111                             cfil_info_action_timed_out(cfil_info, UDP_FLOW_GC_ACTION_TO) ||
6112                             cfil_info_buffer_threshold_exceeded(cfil_info)) {
6113                                 expired_array[expired_count] = cfil_info->cfi_sock_id;
6114                                 expired_count++;
6115                         }
6116                 }
6117         }
6118         cfil_rw_unlock_shared(&cfil_lck_rw);
6119
6120         if (expired_count == 0) {
6121                 goto go_sleep;
6122         }
6123
6124         for (uint32_t i = 0; i < expired_count; i++) {
6125                 // Search for socket (UDP only and lock so)
6126                 so = cfil_socket_from_sock_id(expired_array[i], true);
6127                 if (so == NULL) {
6128                         continue;
6129                 }
6130
6131                 cfil_info = cfil_db_get_cfil_info(so->so_cfil_db, expired_array[i]);
6132                 if (cfil_info == NULL) {
6133                         goto unlock;
6134                 }
6135
6136                 db = so->so_cfil_db;
6137                 hash_entry = cfil_info->cfi_hash_entry;
6138
6139                 if (db == NULL || hash_entry == NULL) {
6140                         goto unlock;
6141                 }
6142
6143 #if GC_DEBUG || LIFECYCLE_DEBUG
6144                 cfil_info_log(LOG_ERR, cfil_info, "CFIL: LIFECYCLE: GC CLEAN UP");
6145 #endif
6146
6147                 cfil_db_delete_entry(db, hash_entry);
6148                 cfil_info_free(cfil_info);
6149                 OSIncrementAtomic(&cfil_stats.cfs_sock_detached);
6150
6151                 if (so->so_flags & SOF_CONTENT_FILTER) {
6152                         if (db->cfdb_count == 0) {
6153                                 so->so_flags &= ~SOF_CONTENT_FILTER;
6154                         }
6155                         VERIFY(so->so_usecount > 0);
6156                         so->so_usecount--;
6157                 }
6158 unlock:
6159                 socket_unlock(so, 1);
6160         }
6161
6162 #if GC_DEBUG
6163         CFIL_LOG(LOG_ERR, "CFIL: UDP flow idle timeout check: expired %d idle flows", expired_count);
6164 #endif
6165         expired_count = 0;
6166
6167 go_sleep:
6168
6169         // Sleep forever (until waken up) if no more UDP flow to clean
6170         cfil_rw_lock_shared(&cfil_lck_rw);
6171         cfil_udp_gc_thread_sleep(cfil_sock_udp_attached_count == 0 ? true : false);
6172         cfil_rw_unlock_shared(&cfil_lck_rw);
6173         thread_block_parameter((thread_continue_t)cfil_info_udp_expire, NULL);
6174         /* NOTREACHED */
6175 }
6176
6177 struct m_tag *
6178 cfil_udp_save_socket_state(struct cfil_info *cfil_info, struct mbuf *m)
6179 {
6180         struct m_tag *tag = NULL;
6181         struct cfil_tag *ctag = NULL;
6182         struct cfil_hash_entry *hash_entry = NULL;
6183
6184         if (cfil_info == NULL || cfil_info->cfi_so == NULL ||
6185             cfil_info->cfi_hash_entry == NULL || m == NULL || !(m->m_flags & M_PKTHDR)) {
6186                 return NULL;
6187         }
6188
6189         /* Allocate a tag */
6190         tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_CFIL_UDP,
6191             sizeof(struct cfil_tag), M_DONTWAIT, m);
6192
6193         if (tag) {
6194                 ctag = (struct cfil_tag*)(tag + 1);
6195                 ctag->cfil_so_state_change_cnt = cfil_info->cfi_so->so_state_change_cnt;
6196                 ctag->cfil_so_options = cfil_info->cfi_so->so_options;
6197
6198                 hash_entry = cfil_info->cfi_hash_entry;
6199                 if (hash_entry->cfentry_family == AF_INET6) {
6200                         fill_ip6_sockaddr_4_6(&ctag->cfil_faddr,
6201                             &hash_entry->cfentry_faddr.addr6,
6202                             hash_entry->cfentry_fport);
6203                 } else if (hash_entry->cfentry_family == AF_INET) {
6204                         fill_ip_sockaddr_4_6(&ctag->cfil_faddr,
6205                             hash_entry->cfentry_faddr.addr46.ia46_addr4,
6206                             hash_entry->cfentry_fport);
6207                 }
6208                 m_tag_prepend(m, tag);
6209                 return tag;
6210         }
6211         return NULL;
6212 }
6213
6214 struct m_tag *
6215 cfil_udp_get_socket_state(struct mbuf *m, uint32_t *state_change_cnt, short *options,
6216     struct sockaddr **faddr)
6217 {
6218         struct m_tag *tag = NULL;
6219         struct cfil_tag *ctag = NULL;
6220
6221         tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_CFIL_UDP, NULL);
6222         if (tag) {
6223                 ctag = (struct cfil_tag *)(tag + 1);
6224                 if (state_change_cnt) {
6225                         *state_change_cnt = ctag->cfil_so_state_change_cnt;
6226                 }
6227                 if (options) {
6228                         *options = ctag->cfil_so_options;
6229                 }
6230                 if (faddr) {
6231                         *faddr = (struct sockaddr *) &ctag->cfil_faddr;
6232                 }
6233
6234                 /*
6235                  * Unlink tag and hand it over to caller.
6236                  * Note that caller will be responsible to free it.
6237                  */
6238                 m_tag_unlink(m, tag);
6239                 return tag;
6240         }
6241         return NULL;
6242 }