apple/xnu.git (xnu-6153.41.3) - bsd/net/bpf.c
1 /*
2 * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1990, 1991, 1993
30 * The Regents of the University of California. All rights reserved.
31 *
32 * This code is derived from the Stanford/CMU enet packet filter,
33 * (net/enet.c) distributed as part of 4.3BSD, and code contributed
34 * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
35 * Berkeley Laboratory.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)bpf.c 8.2 (Berkeley) 3/28/94
66 *
67 * $FreeBSD: src/sys/net/bpf.c,v 1.59.2.5 2001/01/05 04:49:09 jdp Exp $
68 */
69 /*
70 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
71 * support for mandatory and extensible security protections. This notice
72 * is included in support of clause 2.2 (b) of the Apple Public License,
73 * Version 2.0.
74 */
75
76 #include "bpf.h"
77
78 #ifndef __GNUC__
79 #define inline
80 #else
81 #define inline __inline
82 #endif
83
84 #include <sys/param.h>
85 #include <sys/systm.h>
86 #include <sys/conf.h>
87 #include <sys/malloc.h>
88 #include <sys/mbuf.h>
89 #include <sys/time.h>
90 #include <sys/proc.h>
91 #include <sys/signalvar.h>
92 #include <sys/filio.h>
93 #include <sys/sockio.h>
94 #include <sys/ttycom.h>
95 #include <sys/filedesc.h>
96 #include <sys/uio_internal.h>
97 #include <sys/file_internal.h>
98 #include <sys/event.h>
99
100 #include <sys/poll.h>
101
102 #include <sys/socket.h>
103 #include <sys/socketvar.h>
104 #include <sys/vnode.h>
105
106 #include <net/if.h>
107 #include <net/bpf.h>
108 #include <net/bpfdesc.h>
109
110 #include <netinet/in.h>
111 #include <netinet/ip.h>
112 #include <netinet/ip6.h>
113 #include <netinet/in_pcb.h>
114 #include <netinet/in_var.h>
115 #include <netinet/ip_var.h>
116 #include <netinet/tcp.h>
117 #include <netinet/tcp_var.h>
118 #include <netinet/udp.h>
119 #include <netinet/udp_var.h>
120 #include <netinet/if_ether.h>
121 #include <netinet/isakmp.h>
122 #include <netinet6/esp.h>
123 #include <sys/kernel.h>
124 #include <sys/sysctl.h>
125 #include <net/firewire.h>
126
127 #include <miscfs/devfs/devfs.h>
128 #include <net/dlil.h>
129 #include <net/pktap.h>
130
131 #include <kern/locks.h>
132 #include <kern/thread_call.h>
133 #include <libkern/section_keywords.h>
134
135 #if CONFIG_MACF_NET
136 #include <security/mac_framework.h>
137 #endif /* MAC_NET */
138
139 #include <os/log.h>
140
141 extern int tvtohz(struct timeval *);
142
143 #define BPF_BUFSIZE 4096
144 #define UIOMOVE(cp, len, code, uio) uiomove(cp, len, uio)
145
146 #define PRINET 26 /* interruptible */
147
148 #define ISAKMP_HDR_SIZE (sizeof(struct isakmp) + sizeof(struct isakmp_gen))
149 #define ESP_HDR_SIZE sizeof(struct newesp)
150
151 typedef void (*pktcopyfunc_t)(const void *, void *, size_t);
152
153 /*
154 * The default read buffer size is patchable.
155 */
156 static unsigned int bpf_bufsize = BPF_BUFSIZE;
157 SYSCTL_INT(_debug, OID_AUTO, bpf_bufsize, CTLFLAG_RW | CTLFLAG_LOCKED,
158 &bpf_bufsize, 0, "");
159
160 static int sysctl_bpf_maxbufsize SYSCTL_HANDLER_ARGS;
161 extern const int copysize_limit_panic;
162 #define BPF_MAXSIZE_CAP (copysize_limit_panic >> 1)
163 __private_extern__ unsigned int bpf_maxbufsize = BPF_MAXBUFSIZE;
164 SYSCTL_PROC(_debug, OID_AUTO, bpf_maxbufsize, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
165 &bpf_maxbufsize, 0,
166 sysctl_bpf_maxbufsize, "I", "Default BPF max buffer size");
167
168 static unsigned int bpf_maxdevices = 256;
169 SYSCTL_UINT(_debug, OID_AUTO, bpf_maxdevices, CTLFLAG_RW | CTLFLAG_LOCKED,
170 &bpf_maxdevices, 0, "");
171 /*
172  * bpf_wantpktap controls the default visibility of DLT_PKTAP.
173  * On OS X it is off by default, so a process needs to use the BIOCSWANTPKTAP
174  * ioctl explicitly to be able to use DLT_PKTAP (see the sketch below).
175 */
176 #if CONFIG_EMBEDDED
177 static unsigned int bpf_wantpktap = 1;
178 #else
179 static unsigned int bpf_wantpktap = 0;
180 #endif
181 SYSCTL_UINT(_debug, OID_AUTO, bpf_wantpktap, CTLFLAG_RW | CTLFLAG_LOCKED,
182 &bpf_wantpktap, 0, "");
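/*
 * Userspace sketch, for illustration only: opting in to DLT_PKTAP as
 * described above.  Assumes a pktap instance such as "pktap0" exists and
 * that "fd" is an open bpf descriptor; error handling is omitted.
 *
 *	u_int want = 1, dlt = DLT_PKTAP;
 *	struct ifreq ifr;
 *
 *	strlcpy(ifr.ifr_name, "pktap0", sizeof(ifr.ifr_name));
 *	ioctl(fd, BIOCSWANTPKTAP, &want);	// typically set before BIOCSETIF
 *	ioctl(fd, BIOCSETIF, &ifr);
 *	ioctl(fd, BIOCSDLT, &dlt);
 */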
183
184 static int bpf_debug = 0;
185 SYSCTL_INT(_debug, OID_AUTO, bpf_debug, CTLFLAG_RW | CTLFLAG_LOCKED,
186 &bpf_debug, 0, "");
187
188 /*
189 * bpf_iflist is the list of interfaces; each corresponds to an ifnet
190  * bpf_dtab holds pointers to the descriptors, indexed by minor device #
191 */
192 static struct bpf_if *bpf_iflist;
193 #ifdef __APPLE__
194 /*
195 * BSD now stores the bpf_d in the dev_t which is a struct
196 * on their system. Our dev_t is an int, so we still store
197 * the bpf_d in a separate table indexed by minor device #.
198 *
199  * The value stored in bpf_dtab[n] represents one of three states:
200 * NULL: device not opened
201 * BPF_DEV_RESERVED: device opening or closing
202 * other: device <n> opened with pointer to storage
203 */
204 #define BPF_DEV_RESERVED ((struct bpf_d *)(uintptr_t)1)
205 static struct bpf_d **bpf_dtab = NULL;
206 static unsigned int bpf_dtab_size = 0;
207 static unsigned int nbpfilter = 0;
208
209 decl_lck_mtx_data(static, bpf_mlock_data);
210 static lck_mtx_t *bpf_mlock = &bpf_mlock_data;
211 static lck_grp_t *bpf_mlock_grp;
212 static lck_grp_attr_t *bpf_mlock_grp_attr;
213 static lck_attr_t *bpf_mlock_attr;
214
215 #endif /* __APPLE__ */
216
217 static int bpf_allocbufs(struct bpf_d *);
218 static errno_t bpf_attachd(struct bpf_d *d, struct bpf_if *bp);
219 static int bpf_detachd(struct bpf_d *d, int);
220 static void bpf_freed(struct bpf_d *);
221 static int bpf_movein(struct uio *, int,
222 struct mbuf **, struct sockaddr *, int *);
223 static int bpf_setif(struct bpf_d *, ifnet_t ifp, bool, bool);
224 static void bpf_timed_out(void *, void *);
225 static void bpf_wakeup(struct bpf_d *);
226 static u_int get_pkt_trunc_len(u_char *, u_int);
227 static void catchpacket(struct bpf_d *, struct bpf_packet *, u_int, int);
228 static void reset_d(struct bpf_d *);
229 static int bpf_setf(struct bpf_d *, u_int, user_addr_t, u_long);
230 static int bpf_getdltlist(struct bpf_d *, caddr_t, struct proc *);
231 static int bpf_setdlt(struct bpf_d *, u_int);
232 static int bpf_set_traffic_class(struct bpf_d *, int);
233 static void bpf_set_packet_service_class(struct mbuf *, int);
234
235 static void bpf_acquire_d(struct bpf_d *);
236 static void bpf_release_d(struct bpf_d *);
237
238 static int bpf_devsw_installed;
239
240 void bpf_init(void *unused);
241 static int bpf_tap_callback(struct ifnet *ifp, struct mbuf *m);
242
243 /*
244 * Darwin differs from BSD here, the following are static
245 * on BSD and not static on Darwin.
246 */
247 d_open_t bpfopen;
248 d_close_t bpfclose;
249 d_read_t bpfread;
250 d_write_t bpfwrite;
251 ioctl_fcn_t bpfioctl;
252 select_fcn_t bpfselect;
253
254 /* Darwin's cdevsw struct differs slightly from BSDs */
255 #define CDEV_MAJOR 23
256 static struct cdevsw bpf_cdevsw = {
257 .d_open = bpfopen,
258 .d_close = bpfclose,
259 .d_read = bpfread,
260 .d_write = bpfwrite,
261 .d_ioctl = bpfioctl,
262 .d_stop = eno_stop,
263 .d_reset = eno_reset,
264 .d_ttys = NULL,
265 .d_select = bpfselect,
266 .d_mmap = eno_mmap,
267 .d_strategy = eno_strat,
268 .d_reserved_1 = eno_getc,
269 .d_reserved_2 = eno_putc,
270 .d_type = 0
271 };
272
273 #define SOCKADDR_HDR_LEN offsetof(struct sockaddr, sa_data)
274
275 static int
276 bpf_movein(struct uio *uio, int linktype, struct mbuf **mp,
277 struct sockaddr *sockp, int *datlen)
278 {
279 struct mbuf *m;
280 int error;
281 int len;
282 uint8_t sa_family;
283 int hlen;
284
285 switch (linktype) {
286 #if SLIP
287 case DLT_SLIP:
288 sa_family = AF_INET;
289 hlen = 0;
290 break;
291 #endif /* SLIP */
292
293 case DLT_EN10MB:
294 sa_family = AF_UNSPEC;
295 /* XXX Would MAXLINKHDR be better? */
296 hlen = sizeof(struct ether_header);
297 break;
298
299 #if FDDI
300 case DLT_FDDI:
301 #if defined(__FreeBSD__) || defined(__bsdi__)
302 sa_family = AF_IMPLINK;
303 hlen = 0;
304 #else
305 sa_family = AF_UNSPEC;
306 /* XXX 4(FORMAC)+6(dst)+6(src)+3(LLC)+5(SNAP) */
307 hlen = 24;
308 #endif
309 break;
310 #endif /* FDDI */
311
312 case DLT_RAW:
313 case DLT_NULL:
314 sa_family = AF_UNSPEC;
315 hlen = 0;
316 break;
317
318 #ifdef __FreeBSD__
319 case DLT_ATM_RFC1483:
320 /*
321 * en atm driver requires 4-byte atm pseudo header.
322 * though it isn't standard, vpi:vci needs to be
323 * specified anyway.
324 */
325 sa_family = AF_UNSPEC;
326 hlen = 12; /* XXX 4(ATM_PH) + 3(LLC) + 5(SNAP) */
327 break;
328 #endif
329
330 case DLT_PPP:
331 sa_family = AF_UNSPEC;
332 hlen = 4; /* This should match PPP_HDRLEN */
333 break;
334
335 case DLT_APPLE_IP_OVER_IEEE1394:
336 sa_family = AF_UNSPEC;
337 hlen = sizeof(struct firewire_header);
338 break;
339
340 case DLT_IEEE802_11: /* IEEE 802.11 wireless */
341 sa_family = AF_IEEE80211;
342 hlen = 0;
343 break;
344
345 case DLT_IEEE802_11_RADIO:
346 sa_family = AF_IEEE80211;
347 hlen = 0;
348 break;
349
350 default:
351 return EIO;
352 }
353
354 // LP64todo - fix this!
355 len = uio_resid(uio);
356 *datlen = len - hlen;
357 if ((unsigned)len > MCLBYTES) {
358 return EIO;
359 }
360
361 if (sockp) {
362 /*
363 * Build a sockaddr based on the data link layer type.
364 * We do this at this level because the ethernet header
365 * is copied directly into the data field of the sockaddr.
366 * In the case of SLIP, there is no header and the packet
367 * is forwarded as is.
368 * Also, we are careful to leave room at the front of the mbuf
369 * for the link level header.
370 */
371 if ((hlen + SOCKADDR_HDR_LEN) > sockp->sa_len) {
372 return EIO;
373 }
374 sockp->sa_family = sa_family;
375 } else {
376 /*
377 * We're directly sending the packet data supplied by
378 * the user; we don't need to make room for the link
379 * header, and don't need the header length value any
380 * more, so set it to 0.
381 */
382 hlen = 0;
383 }
384
385 MGETHDR(m, M_WAIT, MT_DATA);
386 if (m == 0) {
387 return ENOBUFS;
388 }
389 if ((unsigned)len > MHLEN) {
390 MCLGET(m, M_WAIT);
391 if ((m->m_flags & M_EXT) == 0) {
392 error = ENOBUFS;
393 goto bad;
394 }
395 }
396 m->m_pkthdr.len = m->m_len = len;
397 m->m_pkthdr.rcvif = NULL;
398 *mp = m;
399
400 /*
401 * Make room for link header.
402 */
403 if (hlen != 0) {
404 m->m_pkthdr.len -= hlen;
405 m->m_len -= hlen;
406 m->m_data += hlen; /* XXX */
407 error = UIOMOVE((caddr_t)sockp->sa_data, hlen, UIO_WRITE, uio);
408 if (error) {
409 goto bad;
410 }
411 }
412 error = UIOMOVE(mtod(m, caddr_t), len - hlen, UIO_WRITE, uio);
413 if (error) {
414 goto bad;
415 }
416
417 /* Check for multicast destination */
418 switch (linktype) {
419 case DLT_EN10MB: {
420 struct ether_header *eh;
421
422 eh = mtod(m, struct ether_header *);
423 if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
424 if (_ether_cmp(etherbroadcastaddr,
425 eh->ether_dhost) == 0) {
426 m->m_flags |= M_BCAST;
427 } else {
428 m->m_flags |= M_MCAST;
429 }
430 }
431 break;
432 }
433 }
434
435 return 0;
436 bad:
437 m_freem(m);
438 return error;
439 }
440
441 #ifdef __APPLE__
442
443 /*
444 * The dynamic addition of a new device node must block all processes that
445 * are opening the last device so that no process will get an unexpected
446 * ENOENT
447 */
448 static void
449 bpf_make_dev_t(int maj)
450 {
451 static int bpf_growing = 0;
452 unsigned int cur_size = nbpfilter, i;
453
454 if (nbpfilter >= bpf_maxdevices) {
455 return;
456 }
457
458 while (bpf_growing) {
459 /* Wait until new device has been created */
460 (void) tsleep((caddr_t)&bpf_growing, PZERO, "bpf_growing", 0);
461 }
462 if (nbpfilter > cur_size) {
463 /* other thread grew it already */
464 return;
465 }
466 bpf_growing = 1;
467
468 /* need to grow bpf_dtab first */
469 if (nbpfilter == bpf_dtab_size) {
470 int new_dtab_size;
471 struct bpf_d **new_dtab = NULL;
472 struct bpf_d **old_dtab = NULL;
473
474 new_dtab_size = bpf_dtab_size + NBPFILTER;
475 new_dtab = (struct bpf_d **)_MALLOC(
476 sizeof(struct bpf_d *) * new_dtab_size, M_DEVBUF, M_WAIT);
477 if (new_dtab == 0) {
478 printf("bpf_make_dev_t: malloc bpf_dtab failed\n");
479 goto done;
480 }
481 if (bpf_dtab) {
482 bcopy(bpf_dtab, new_dtab,
483 sizeof(struct bpf_d *) * bpf_dtab_size);
484 }
485 bzero(new_dtab + bpf_dtab_size,
486 sizeof(struct bpf_d *) * NBPFILTER);
487 old_dtab = bpf_dtab;
488 bpf_dtab = new_dtab;
489 bpf_dtab_size = new_dtab_size;
490 if (old_dtab != NULL) {
491 _FREE(old_dtab, M_DEVBUF);
492 }
493 }
494 i = nbpfilter++;
495 (void) devfs_make_node(makedev(maj, i),
496 DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0600,
497 "bpf%d", i);
498 done:
499 bpf_growing = 0;
500 wakeup((caddr_t)&bpf_growing);
501 }
502
503 #endif
504
505 /*
506 * Attach file to the bpf interface, i.e. make d listen on bp.
507 */
508 static errno_t
509 bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
510 {
511 int first = bp->bif_dlist == NULL;
512 int error = 0;
513
514 /*
515 * Point d at bp, and add d to the interface's list of listeners.
516 * Finally, point the driver's bpf cookie at the interface so
517 * it will divert packets to bpf.
518 */
519 d->bd_bif = bp;
520 d->bd_next = bp->bif_dlist;
521 bp->bif_dlist = d;
522
523 /*
524 * Take a reference on the device even if an error is returned
525 * because we keep the device in the interface's list of listeners
526 */
527 bpf_acquire_d(d);
528
529 if (first) {
530 /* Find the default bpf entry for this ifp */
531 if (bp->bif_ifp->if_bpf == NULL) {
532 struct bpf_if *tmp, *primary = NULL;
533
534 for (tmp = bpf_iflist; tmp; tmp = tmp->bif_next) {
535 if (tmp->bif_ifp == bp->bif_ifp) {
536 primary = tmp;
537 break;
538 }
539 }
540 bp->bif_ifp->if_bpf = primary;
541 }
542 /* Only call dlil_set_bpf_tap for primary dlt */
543 if (bp->bif_ifp->if_bpf == bp) {
544 dlil_set_bpf_tap(bp->bif_ifp, BPF_TAP_INPUT_OUTPUT,
545 bpf_tap_callback);
546 }
547
548 if (bp->bif_tap != NULL) {
549 error = bp->bif_tap(bp->bif_ifp, bp->bif_dlt,
550 BPF_TAP_INPUT_OUTPUT);
551 }
552 }
553
554 /*
555 * Reset the detach flags in case we previously detached an interface
556 */
557 d->bd_flags &= ~(BPF_DETACHING | BPF_DETACHED);
558
559 if (bp->bif_dlt == DLT_PKTAP) {
560 d->bd_flags |= BPF_FINALIZE_PKTAP;
561 } else {
562 d->bd_flags &= ~BPF_FINALIZE_PKTAP;
563 }
564 return error;
565 }
566
567 /*
568 * Detach a file from its interface.
569 *
570  * Return 1 if it was closed by some thread, 0 otherwise
571 */
572 static int
573 bpf_detachd(struct bpf_d *d, int closing)
574 {
575 struct bpf_d **p;
576 struct bpf_if *bp;
577 struct ifnet *ifp;
578
579 int bpf_closed = d->bd_flags & BPF_CLOSING;
580 /*
581 * Some other thread already detached
582 */
583 if ((d->bd_flags & (BPF_DETACHED | BPF_DETACHING)) != 0) {
584 goto done;
585 }
586 /*
587 * This thread is doing the detach
588 */
589 d->bd_flags |= BPF_DETACHING;
590
591 ifp = d->bd_bif->bif_ifp;
592 bp = d->bd_bif;
593
594 if (bpf_debug != 0) {
595 printf("%s: %llx %s%s\n",
596 __func__, (uint64_t)VM_KERNEL_ADDRPERM(d),
597 if_name(ifp), closing ? " closing" : "");
598 }
599
600 /* Remove d from the interface's descriptor list. */
601 p = &bp->bif_dlist;
602 while (*p != d) {
603 p = &(*p)->bd_next;
604 if (*p == 0) {
605 panic("bpf_detachd: descriptor not in list");
606 }
607 }
608 *p = (*p)->bd_next;
609 if (bp->bif_dlist == 0) {
610 /*
611 * Let the driver know that there are no more listeners.
612 */
613 /* Only call dlil_set_bpf_tap for primary dlt */
614 if (bp->bif_ifp->if_bpf == bp) {
615 dlil_set_bpf_tap(ifp, BPF_TAP_DISABLE, NULL);
616 }
617 if (bp->bif_tap) {
618 bp->bif_tap(ifp, bp->bif_dlt, BPF_TAP_DISABLE);
619 }
620
621 for (bp = bpf_iflist; bp; bp = bp->bif_next) {
622 if (bp->bif_ifp == ifp && bp->bif_dlist != 0) {
623 break;
624 }
625 }
626 if (bp == NULL) {
627 ifp->if_bpf = NULL;
628 }
629 }
630 d->bd_bif = NULL;
631 /*
632 * Check if this descriptor had requested promiscuous mode.
633 * If so, turn it off.
634 */
635 if (d->bd_promisc) {
636 d->bd_promisc = 0;
637 lck_mtx_unlock(bpf_mlock);
638 if (ifnet_set_promiscuous(ifp, 0)) {
639 /*
640 * Something is really wrong if we were able to put
641 * the driver into promiscuous mode, but can't
642 * take it out.
643 * Most likely the network interface is gone.
644 */
645 printf("%s: ifnet_set_promiscuous failed\n", __func__);
646 }
647 lck_mtx_lock(bpf_mlock);
648 }
649
650 /*
651  * Wake up other threads that are waiting for this thread to finish
652 * detaching
653 */
654 d->bd_flags &= ~BPF_DETACHING;
655 d->bd_flags |= BPF_DETACHED;
656
657 /* Refresh the local variable as d could have been modified */
658 bpf_closed = d->bd_flags & BPF_CLOSING;
659 /*
660  * Note that we've kept the reference because we may have dropped
661 * the lock when turning off promiscuous mode
662 */
663 bpf_release_d(d);
664
665 done:
666 /*
667  * When closing, make sure no other thread refers to the bpf_d
668 */
669 if (bpf_debug != 0) {
670 printf("%s: %llx done\n",
671 __func__, (uint64_t)VM_KERNEL_ADDRPERM(d));
672 }
673 /*
674 * Let the caller know the bpf_d is closed
675 */
676 if (bpf_closed) {
677 return 1;
678 } else {
679 return 0;
680 }
681 }
682
683 /*
684 * Start asynchronous timer, if necessary.
685 * Must be called with bpf_mlock held.
686 */
687 static void
688 bpf_start_timer(struct bpf_d *d)
689 {
690 uint64_t deadline;
691 struct timeval tv;
692
693 if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
694 tv.tv_sec = d->bd_rtout / hz;
695 tv.tv_usec = (d->bd_rtout % hz) * tick;
696
697 clock_interval_to_deadline(
698 (uint64_t)tv.tv_sec * USEC_PER_SEC + tv.tv_usec,
699 NSEC_PER_USEC, &deadline);
700 /*
701 * The state is BPF_IDLE, so the timer hasn't
702 * been started yet, and hasn't gone off yet;
703 * there is no thread call scheduled, so this
704 * won't change the schedule.
705 *
706 * XXX - what if, by the time it gets entered,
707 * the deadline has already passed?
708 */
709 thread_call_enter_delayed(d->bd_thread_call, deadline);
710 d->bd_state = BPF_WAITING;
711 }
712 }
713
714 /*
715 * Cancel asynchronous timer.
716 * Must be called with bpf_mlock held.
717 */
718 static boolean_t
719 bpf_stop_timer(struct bpf_d *d)
720 {
721 /*
722 * If the timer has already gone off, this does nothing.
723 * Our caller is expected to set d->bd_state to BPF_IDLE,
724 * with the bpf_mlock, after we are called. bpf_timed_out()
725 * also grabs bpf_mlock, so, if the timer has gone off and
726 * bpf_timed_out() hasn't finished, it's waiting for the
727 * lock; when this thread releases the lock, it will
728 * find the state is BPF_IDLE, and just release the
729 * lock and return.
730 */
731 return thread_call_cancel(d->bd_thread_call);
732 }
733
734 void
735 bpf_acquire_d(struct bpf_d *d)
736 {
737 void *lr_saved = __builtin_return_address(0);
738
739 LCK_MTX_ASSERT(bpf_mlock, LCK_MTX_ASSERT_OWNED);
740
741 d->bd_refcnt += 1;
742
743 d->bd_ref_lr[d->bd_next_ref_lr] = lr_saved;
744 d->bd_next_ref_lr = (d->bd_next_ref_lr + 1) % BPF_REF_HIST;
745 }
746
747 void
748 bpf_release_d(struct bpf_d *d)
749 {
750 void *lr_saved = __builtin_return_address(0);
751
752 LCK_MTX_ASSERT(bpf_mlock, LCK_MTX_ASSERT_OWNED);
753
754 if (d->bd_refcnt <= 0) {
755 panic("%s: %p refcnt <= 0", __func__, d);
756 }
757
758 d->bd_refcnt -= 1;
759
760 d->bd_unref_lr[d->bd_next_unref_lr] = lr_saved;
761 d->bd_next_unref_lr = (d->bd_next_unref_lr + 1) % BPF_REF_HIST;
762
763 if (d->bd_refcnt == 0) {
764 /* Assert the device is detached */
765 if ((d->bd_flags & BPF_DETACHED) == 0) {
766 panic("%s: %p BPF_DETACHED not set", __func__, d);
767 }
768
769 _FREE(d, M_DEVBUF);
770 }
771 }
772
773 /*
774  * Open the bpf device. Returns ENXIO for an illegal minor device number,
775 * EBUSY if file is open by another process.
776 */
777 /* ARGSUSED */
778 int
779 bpfopen(dev_t dev, int flags, __unused int fmt,
780 struct proc *p)
781 {
782 struct bpf_d *d;
783
784 lck_mtx_lock(bpf_mlock);
785 if ((unsigned int) minor(dev) >= nbpfilter) {
786 lck_mtx_unlock(bpf_mlock);
787 return ENXIO;
788 }
789 /*
790 * New device nodes are created on demand when opening the last one.
791 * The programming model is for processes to loop on the minor starting
792 * at 0 as long as EBUSY is returned. The loop stops when either the
793  * open succeeds or an error other than EBUSY is returned. That means
794 * that bpf_make_dev_t() must block all processes that are opening the
795 * last node. If not all processes are blocked, they could unexpectedly
796  * get ENOENT and abort their opening loop; a usage sketch follows this function.
797 */
798 if ((unsigned int) minor(dev) == (nbpfilter - 1)) {
799 bpf_make_dev_t(major(dev));
800 }
801
802 /*
803 * Each minor can be opened by only one process. If the requested
804 * minor is in use, return EBUSY.
805 *
806 * Important: bpfopen() and bpfclose() have to check and set the status
807  * of a device in the same locking context, otherwise the device may be
808  * leaked because the vnode use count will be unexpectedly greater than 1
809 * when close() is called.
810 */
811 if (bpf_dtab[minor(dev)] == NULL) {
812 /* Reserve while opening */
813 bpf_dtab[minor(dev)] = BPF_DEV_RESERVED;
814 } else {
815 lck_mtx_unlock(bpf_mlock);
816 return EBUSY;
817 }
818 d = (struct bpf_d *)_MALLOC(sizeof(struct bpf_d), M_DEVBUF,
819 M_WAIT | M_ZERO);
820 if (d == NULL) {
821 /* this really is a catastrophic failure */
822 printf("bpfopen: malloc bpf_d failed\n");
823 bpf_dtab[minor(dev)] = NULL;
824 lck_mtx_unlock(bpf_mlock);
825 return ENOMEM;
826 }
827
828 /* Mark "in use" and do most initialization. */
829 bpf_acquire_d(d);
830 d->bd_bufsize = bpf_bufsize;
831 d->bd_sig = SIGIO;
832 d->bd_seesent = 1;
833 d->bd_oflags = flags;
834 d->bd_state = BPF_IDLE;
835 d->bd_traffic_class = SO_TC_BE;
836 d->bd_flags |= BPF_DETACHED;
837 if (bpf_wantpktap) {
838 d->bd_flags |= BPF_WANT_PKTAP;
839 } else {
840 d->bd_flags &= ~BPF_WANT_PKTAP;
841 }
842 d->bd_thread_call = thread_call_allocate(bpf_timed_out, d);
843 if (d->bd_thread_call == NULL) {
844 printf("bpfopen: malloc thread call failed\n");
845 bpf_dtab[minor(dev)] = NULL;
846 bpf_release_d(d);
847 lck_mtx_unlock(bpf_mlock);
848
849 return ENOMEM;
850 }
851 d->bd_opened_by = p;
852 uuid_generate(d->bd_uuid);
853
854 #if CONFIG_MACF_NET
855 mac_bpfdesc_label_init(d);
856 mac_bpfdesc_label_associate(kauth_cred_get(), d);
857 #endif
858 bpf_dtab[minor(dev)] = d; /* Mark opened */
859 lck_mtx_unlock(bpf_mlock);
860
861 return 0;
862 }
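/*
 * Userspace sketch, for illustration only, of the open loop described in
 * bpfopen() above: iterate over the minors until open(2) succeeds or fails
 * with an error other than EBUSY.  The device count bound is hypothetical.
 *
 *	static int
 *	bpf_open_any(void)
 *	{
 *		char name[32];
 *		int i, fd;
 *
 *		for (i = 0; i < 256; i++) {
 *			snprintf(name, sizeof(name), "/dev/bpf%d", i);
 *			fd = open(name, O_RDWR);
 *			if (fd >= 0 || errno != EBUSY)
 *				return fd;	// success, or an error other than EBUSY
 *		}
 *		return -1;
 *	}
 */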
863
864 /*
865 * Close the descriptor by detaching it from its interface,
866 * deallocating its buffers, and marking it free.
867 */
868 /* ARGSUSED */
869 int
870 bpfclose(dev_t dev, __unused int flags, __unused int fmt,
871 __unused struct proc *p)
872 {
873 struct bpf_d *d;
874
875 /* Take BPF lock to ensure no other thread is using the device */
876 lck_mtx_lock(bpf_mlock);
877
878 d = bpf_dtab[minor(dev)];
879 if (d == NULL || d == BPF_DEV_RESERVED) {
880 lck_mtx_unlock(bpf_mlock);
881 return ENXIO;
882 }
883
884 /*
885  * Other threads may call bpf_detachd() if we drop the bpf_mlock
886 */
887 d->bd_flags |= BPF_CLOSING;
888
889 if (bpf_debug != 0) {
890 printf("%s: %llx\n",
891 __func__, (uint64_t)VM_KERNEL_ADDRPERM(d));
892 }
893
894 bpf_dtab[minor(dev)] = BPF_DEV_RESERVED; /* Reserve while closing */
895
896 /*
897 * Deal with any in-progress timeouts.
898 */
899 switch (d->bd_state) {
900 case BPF_IDLE:
901 /*
902 * Not waiting for a timeout, and no timeout happened.
903 */
904 break;
905
906 case BPF_WAITING:
907 /*
908 * Waiting for a timeout.
909 * Cancel any timer that has yet to go off,
910 * and mark the state as "closing".
911 * Then drop the lock to allow any timers that
912 * *have* gone off to run to completion, and wait
913 * for them to finish.
914 */
915 if (!bpf_stop_timer(d)) {
916 /*
917 * There was no pending call, so the call must
918 * have been in progress. Wait for the call to
919 * complete; we have to drop the lock while
920  * waiting, to let the in-progress call complete.
921 */
922 d->bd_state = BPF_DRAINING;
923 while (d->bd_state == BPF_DRAINING) {
924 msleep((caddr_t)d, bpf_mlock, PRINET,
925 "bpfdraining", NULL);
926 }
927 }
928 d->bd_state = BPF_IDLE;
929 break;
930
931 case BPF_TIMED_OUT:
932 /*
933 * Timer went off, and the timeout routine finished.
934 */
935 d->bd_state = BPF_IDLE;
936 break;
937
938 case BPF_DRAINING:
939 /*
940 * Another thread is blocked on a close waiting for
941 * a timeout to finish.
942 * This "shouldn't happen", as the first thread to enter
943  * bpfclose() will set bpf_dtab[minor(dev)] to BPF_DEV_RESERVED, and
944 * all subsequent threads should see that and fail with
945 * ENXIO.
946 */
947 panic("Two threads blocked in a BPF close");
948 break;
949 }
950
951 if (d->bd_bif) {
952 bpf_detachd(d, 1);
953 }
954 selthreadclear(&d->bd_sel);
955 #if CONFIG_MACF_NET
956 mac_bpfdesc_label_destroy(d);
957 #endif
958 thread_call_free(d->bd_thread_call);
959
960 while (d->bd_hbuf_read != 0) {
961 msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading", NULL);
962 }
963
964 bpf_freed(d);
965
966 /* Mark free in same context as bpfopen comes to check */
967 bpf_dtab[minor(dev)] = NULL; /* Mark closed */
968
969 bpf_release_d(d);
970
971 lck_mtx_unlock(bpf_mlock);
972
973 return 0;
974 }
975
976 #define BPF_SLEEP bpf_sleep
977
978 static int
979 bpf_sleep(struct bpf_d *d, int pri, const char *wmesg, int timo)
980 {
981 u_int64_t abstime = 0;
982
983 if (timo != 0) {
984 clock_interval_to_deadline(timo, NSEC_PER_SEC / hz, &abstime);
985 }
986
987 return msleep1((caddr_t)d, bpf_mlock, pri, wmesg, abstime);
988 }
989
990 static void
991 bpf_finalize_pktap(struct bpf_hdr *hp, struct pktap_header *pktaphdr)
992 {
993 if (pktaphdr->pth_flags & PTH_FLAG_V2_HDR) {
994 struct pktap_v2_hdr *pktap_v2_hdr;
995
996 pktap_v2_hdr = (struct pktap_v2_hdr *)pktaphdr;
997
998 if (pktap_v2_hdr->pth_flags & PTH_FLAG_DELAY_PKTAP) {
999 pktap_v2_finalize_proc_info(pktap_v2_hdr);
1000 }
1001 } else {
1002 if (pktaphdr->pth_flags & PTH_FLAG_DELAY_PKTAP) {
1003 pktap_finalize_proc_info(pktaphdr);
1004 }
1005
1006 if (pktaphdr->pth_flags & PTH_FLAG_TSTAMP) {
1007 hp->bh_tstamp.tv_sec = pktaphdr->pth_tstamp.tv_sec;
1008 hp->bh_tstamp.tv_usec = pktaphdr->pth_tstamp.tv_usec;
1009 }
1010 }
1011 }
1012
1013 /*
1014 * Rotate the packet buffers in descriptor d. Move the store buffer
1015 * into the hold slot, and the free buffer into the store slot.
1016 * Zero the length of the new store buffer.
1017 */
1018 #define ROTATE_BUFFERS(d) \
1019 if (d->bd_hbuf_read != 0) \
1020 panic("rotating bpf buffers during read"); \
1021 (d)->bd_hbuf = (d)->bd_sbuf; \
1022 (d)->bd_hlen = (d)->bd_slen; \
1023 (d)->bd_hcnt = (d)->bd_scnt; \
1024 (d)->bd_sbuf = (d)->bd_fbuf; \
1025 (d)->bd_slen = 0; \
1026 (d)->bd_scnt = 0; \
1027 (d)->bd_fbuf = NULL;
1028 /*
1029 * bpfread - read next chunk of packets from buffers
1030 */
1031 int
1032 bpfread(dev_t dev, struct uio *uio, int ioflag)
1033 {
1034 struct bpf_d *d;
1035 caddr_t hbuf;
1036 int timed_out, hbuf_len;
1037 int error;
1038 int flags;
1039
1040 lck_mtx_lock(bpf_mlock);
1041
1042 d = bpf_dtab[minor(dev)];
1043 if (d == NULL || d == BPF_DEV_RESERVED ||
1044 (d->bd_flags & BPF_CLOSING) != 0) {
1045 lck_mtx_unlock(bpf_mlock);
1046 return ENXIO;
1047 }
1048
1049 bpf_acquire_d(d);
1050
1051 /*
1052 * Restrict application to use a buffer the same size as
1053  * the kernel buffers.
1054 */
1055 if (uio_resid(uio) != d->bd_bufsize) {
1056 bpf_release_d(d);
1057 lck_mtx_unlock(bpf_mlock);
1058 return EINVAL;
1059 }
1060
1061 if (d->bd_state == BPF_WAITING) {
1062 bpf_stop_timer(d);
1063 }
1064
1065 timed_out = (d->bd_state == BPF_TIMED_OUT);
1066 d->bd_state = BPF_IDLE;
1067
1068 while (d->bd_hbuf_read != 0) {
1069 msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading", NULL);
1070 }
1071
1072 if ((d->bd_flags & BPF_CLOSING) != 0) {
1073 bpf_release_d(d);
1074 lck_mtx_unlock(bpf_mlock);
1075 return ENXIO;
1076 }
1077 /*
1078 * If the hold buffer is empty, then do a timed sleep, which
1079 * ends when the timeout expires or when enough packets
1080 * have arrived to fill the store buffer.
1081 */
1082 while (d->bd_hbuf == 0) {
1083 if ((d->bd_immediate || timed_out || (ioflag & IO_NDELAY)) &&
1084 d->bd_slen != 0) {
1085 /*
1086 * We're in immediate mode, or are reading
1087 * in non-blocking mode, or a timer was
1088 * started before the read (e.g., by select()
1089 * or poll()) and has expired and a packet(s)
1090 * either arrived since the previous
1091 * read or arrived while we were asleep.
1092 * Rotate the buffers and return what's here.
1093 */
1094 ROTATE_BUFFERS(d);
1095 break;
1096 }
1097
1098 /*
1099 * No data is available, check to see if the bpf device
1100 * is still pointed at a real interface. If not, return
1101 * ENXIO so that the userland process knows to rebind
1102 * it before using it again.
1103 */
1104 if (d->bd_bif == NULL) {
1105 bpf_release_d(d);
1106 lck_mtx_unlock(bpf_mlock);
1107 return ENXIO;
1108 }
1109 if (ioflag & IO_NDELAY) {
1110 bpf_release_d(d);
1111 lck_mtx_unlock(bpf_mlock);
1112 return EWOULDBLOCK;
1113 }
1114 error = BPF_SLEEP(d, PRINET | PCATCH, "bpf", d->bd_rtout);
1115 /*
1116 * Make sure device is still opened
1117 */
1118 if ((d->bd_flags & BPF_CLOSING) != 0) {
1119 bpf_release_d(d);
1120 lck_mtx_unlock(bpf_mlock);
1121 return ENXIO;
1122 }
1123
1124 while (d->bd_hbuf_read != 0) {
1125 msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading",
1126 NULL);
1127 }
1128
1129 if ((d->bd_flags & BPF_CLOSING) != 0) {
1130 bpf_release_d(d);
1131 lck_mtx_unlock(bpf_mlock);
1132 return ENXIO;
1133 }
1134
1135 if (error == EINTR || error == ERESTART) {
1136 if (d->bd_hbuf != NULL) {
1137 /*
1138 * Because we msleep, the hold buffer might
1139 * be filled when we wake up. Avoid rotating
1140 * in this case.
1141 */
1142 break;
1143 }
1144 if (d->bd_slen != 0) {
1145 /*
1146 * Sometimes we may be interrupted often and
1147 * the sleep above will not timeout.
1148 * Regardless, we should rotate the buffers
1149 * if there's any new data pending and
1150 * return it.
1151 */
1152 ROTATE_BUFFERS(d);
1153 break;
1154 }
1155 bpf_release_d(d);
1156 lck_mtx_unlock(bpf_mlock);
1157 if (error == ERESTART) {
1158 printf("%s: %llx ERESTART to EINTR\n",
1159 __func__, (uint64_t)VM_KERNEL_ADDRPERM(d));
1160 error = EINTR;
1161 }
1162 return error;
1163 }
1164 if (error == EWOULDBLOCK) {
1165 /*
1166 * On a timeout, return what's in the buffer,
1167 * which may be nothing. If there is something
1168 * in the store buffer, we can rotate the buffers.
1169 */
1170 if (d->bd_hbuf) {
1171 /*
1172 * We filled up the buffer in between
1173 * getting the timeout and arriving
1174 * here, so we don't need to rotate.
1175 */
1176 break;
1177 }
1178
1179 if (d->bd_slen == 0) {
1180 bpf_release_d(d);
1181 lck_mtx_unlock(bpf_mlock);
1182 return 0;
1183 }
1184 ROTATE_BUFFERS(d);
1185 break;
1186 }
1187 }
1188 /*
1189 * At this point, we know we have something in the hold slot.
1190 */
1191
1192 /*
1193  * Set the hold buffer read flag, so we do not
1194  * rotate the buffers until the hold buffer
1195  * read is complete, and to avoid issues resulting
1196  * from page faults during disk sleep (<rdar://problem/13436396>).
1197 */
1198 d->bd_hbuf_read = 1;
1199 hbuf = d->bd_hbuf;
1200 hbuf_len = d->bd_hlen;
1201 flags = d->bd_flags;
1202 lck_mtx_unlock(bpf_mlock);
1203
1204 #ifdef __APPLE__
1205 /*
1206 * Before we move data to userland, we fill out the extended
1207 * header fields.
1208 */
1209 if (flags & BPF_EXTENDED_HDR) {
1210 char *p;
1211
1212 p = hbuf;
1213 while (p < hbuf + hbuf_len) {
1214 struct bpf_hdr_ext *ehp;
1215 uint32_t flowid;
1216 struct so_procinfo soprocinfo;
1217 int found = 0;
1218
1219 ehp = (struct bpf_hdr_ext *)(void *)p;
1220 if ((flowid = ehp->bh_flowid) != 0) {
1221 if (ehp->bh_proto == IPPROTO_TCP) {
1222 found = inp_findinpcb_procinfo(&tcbinfo,
1223 flowid, &soprocinfo);
1224 } else if (ehp->bh_proto == IPPROTO_UDP) {
1225 found = inp_findinpcb_procinfo(&udbinfo,
1226 flowid, &soprocinfo);
1227 }
1228 if (found == 1) {
1229 ehp->bh_pid = soprocinfo.spi_pid;
1230 strlcpy(&ehp->bh_comm[0], &soprocinfo.spi_proc_name[0], sizeof(ehp->bh_comm));
1231 }
1232 ehp->bh_flowid = 0;
1233 }
1234
1235 if (flags & BPF_FINALIZE_PKTAP) {
1236 struct pktap_header *pktaphdr;
1237
1238 pktaphdr = (struct pktap_header *)(void *)
1239 (p + BPF_WORDALIGN(ehp->bh_hdrlen));
1240
1241 bpf_finalize_pktap((struct bpf_hdr *) ehp,
1242 pktaphdr);
1243 }
1244 p += BPF_WORDALIGN(ehp->bh_hdrlen + ehp->bh_caplen);
1245 }
1246 } else if (flags & BPF_FINALIZE_PKTAP) {
1247 char *p;
1248
1249 p = hbuf;
1250 while (p < hbuf + hbuf_len) {
1251 struct bpf_hdr *hp;
1252 struct pktap_header *pktaphdr;
1253
1254 hp = (struct bpf_hdr *)(void *)p;
1255 pktaphdr = (struct pktap_header *)(void *)
1256 (p + BPF_WORDALIGN(hp->bh_hdrlen));
1257
1258 bpf_finalize_pktap(hp, pktaphdr);
1259
1260 p += BPF_WORDALIGN(hp->bh_hdrlen + hp->bh_caplen);
1261 }
1262 }
1263 #endif
1264
1265 /*
1266 * Move data from hold buffer into user space.
1267 * We know the entire buffer is transferred since
1268  * we checked above that the read buffer is d->bd_bufsize bytes.
1269 */
1270 error = UIOMOVE(hbuf, hbuf_len, UIO_READ, uio);
1271
1272 lck_mtx_lock(bpf_mlock);
1273 /*
1274 * Make sure device is still opened
1275 */
1276 if ((d->bd_flags & BPF_CLOSING) != 0) {
1277 bpf_release_d(d);
1278 lck_mtx_unlock(bpf_mlock);
1279 return ENXIO;
1280 }
1281
1282 d->bd_hbuf_read = 0;
1283 d->bd_fbuf = d->bd_hbuf;
1284 d->bd_hbuf = NULL;
1285 d->bd_hlen = 0;
1286 d->bd_hcnt = 0;
1287 wakeup((caddr_t)d);
1288
1289 bpf_release_d(d);
1290 lck_mtx_unlock(bpf_mlock);
1291 return error;
1292 }
1293
1294 /*
1295 * If there are processes sleeping on this descriptor, wake them up.
1296 */
1297 static void
1298 bpf_wakeup(struct bpf_d *d)
1299 {
1300 if (d->bd_state == BPF_WAITING) {
1301 bpf_stop_timer(d);
1302 d->bd_state = BPF_IDLE;
1303 }
1304 wakeup((caddr_t)d);
1305 if (d->bd_async && d->bd_sig && d->bd_sigio) {
1306 pgsigio(d->bd_sigio, d->bd_sig);
1307 }
1308
1309 selwakeup(&d->bd_sel);
1310 if ((d->bd_flags & BPF_KNOTE)) {
1311 KNOTE(&d->bd_sel.si_note, 1);
1312 }
1313 }
1314
1315 static void
1316 bpf_timed_out(void *arg, __unused void *dummy)
1317 {
1318 struct bpf_d *d = (struct bpf_d *)arg;
1319
1320 lck_mtx_lock(bpf_mlock);
1321 if (d->bd_state == BPF_WAITING) {
1322 /*
1323 * There's a select or kqueue waiting for this; if there's
1324 * now stuff to read, wake it up.
1325 */
1326 d->bd_state = BPF_TIMED_OUT;
1327 if (d->bd_slen != 0) {
1328 bpf_wakeup(d);
1329 }
1330 } else if (d->bd_state == BPF_DRAINING) {
1331 /*
1332 * A close is waiting for this to finish.
1333 * Mark it as finished, and wake the close up.
1334 */
1335 d->bd_state = BPF_IDLE;
1336 bpf_wakeup(d);
1337 }
1338 lck_mtx_unlock(bpf_mlock);
1339 }
1340
1341 /* keep in sync with bpf_movein above: */
1342 #define MAX_DATALINK_HDR_LEN (sizeof(struct firewire_header))
1343
1344 int
1345 bpfwrite(dev_t dev, struct uio *uio, __unused int ioflag)
1346 {
1347 struct bpf_d *d;
1348 struct ifnet *ifp;
1349 struct mbuf *m = NULL;
1350 int error;
1351 char dst_buf[SOCKADDR_HDR_LEN + MAX_DATALINK_HDR_LEN];
1352 int datlen = 0;
1353 int bif_dlt;
1354 int bd_hdrcmplt;
1355
1356 lck_mtx_lock(bpf_mlock);
1357
1358 d = bpf_dtab[minor(dev)];
1359 if (d == NULL || d == BPF_DEV_RESERVED ||
1360 (d->bd_flags & BPF_CLOSING) != 0) {
1361 lck_mtx_unlock(bpf_mlock);
1362 return ENXIO;
1363 }
1364
1365 bpf_acquire_d(d);
1366
1367 if (d->bd_bif == 0) {
1368 bpf_release_d(d);
1369 lck_mtx_unlock(bpf_mlock);
1370 return ENXIO;
1371 }
1372
1373 ifp = d->bd_bif->bif_ifp;
1374
1375 if ((ifp->if_flags & IFF_UP) == 0) {
1376 bpf_release_d(d);
1377 lck_mtx_unlock(bpf_mlock);
1378 return ENETDOWN;
1379 }
1380 if (uio_resid(uio) == 0) {
1381 bpf_release_d(d);
1382 lck_mtx_unlock(bpf_mlock);
1383 return 0;
1384 }
1385 ((struct sockaddr *)dst_buf)->sa_len = sizeof(dst_buf);
1386
1387 /*
1388 * fix for PR-6849527
1389  * getting variables onto stack before dropping lock for bpf_movein()
1390 */
1391 bif_dlt = (int)d->bd_bif->bif_dlt;
1392 bd_hdrcmplt = d->bd_hdrcmplt;
1393
1394 /* bpf_movein allocating mbufs; drop lock */
1395 lck_mtx_unlock(bpf_mlock);
1396
1397 error = bpf_movein(uio, bif_dlt, &m,
1398 bd_hdrcmplt ? NULL : (struct sockaddr *)dst_buf,
1399 &datlen);
1400
1401 /* take the lock again */
1402 lck_mtx_lock(bpf_mlock);
1403 if (error) {
1404 bpf_release_d(d);
1405 lck_mtx_unlock(bpf_mlock);
1406 return error;
1407 }
1408
1409 /* verify the device is still open */
1410 if ((d->bd_flags & BPF_CLOSING) != 0) {
1411 bpf_release_d(d);
1412 lck_mtx_unlock(bpf_mlock);
1413 m_freem(m);
1414 return ENXIO;
1415 }
1416
1417 if (d->bd_bif == NULL) {
1418 bpf_release_d(d);
1419 lck_mtx_unlock(bpf_mlock);
1420 m_free(m);
1421 return ENXIO;
1422 }
1423
1424 if ((unsigned)datlen > ifp->if_mtu) {
1425 bpf_release_d(d);
1426 lck_mtx_unlock(bpf_mlock);
1427 m_freem(m);
1428 return EMSGSIZE;
1429 }
1430
1431 #if CONFIG_MACF_NET
1432 mac_mbuf_label_associate_bpfdesc(d, m);
1433 #endif
1434
1435 bpf_set_packet_service_class(m, d->bd_traffic_class);
1436
1437 lck_mtx_unlock(bpf_mlock);
1438
1439 /*
1440 * The driver frees the mbuf.
1441 */
1442 if (d->bd_hdrcmplt) {
1443 if (d->bd_bif->bif_send) {
1444 error = d->bd_bif->bif_send(ifp, d->bd_bif->bif_dlt, m);
1445 } else {
1446 error = dlil_output(ifp, 0, m, NULL, NULL, 1, NULL);
1447 }
1448 } else {
1449 error = dlil_output(ifp, PF_INET, m, NULL,
1450 (struct sockaddr *)dst_buf, 0, NULL);
1451 }
1452
1453 lck_mtx_lock(bpf_mlock);
1454 bpf_release_d(d);
1455 lck_mtx_unlock(bpf_mlock);
1456
1457 return error;
1458 }
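/*
 * Userspace sketch, for illustration only, of injecting a frame through
 * bpfwrite() above.  With the "header complete" flag set, the caller
 * supplies the full link-layer header and the frame is sent unmodified;
 * "frame" and "frame_len" are assumed to be built by the caller.
 *
 *	u_int hdrcmplt = 1;
 *
 *	ioctl(fd, BIOCSHDRCMPLT, &hdrcmplt);
 *	ssize_t n = write(fd, frame, frame_len);
 */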
1459
1460 /*
1461 * Reset a descriptor by flushing its packet buffer and clearing the
1462 * receive and drop counts.
1463 */
1464 static void
1465 reset_d(struct bpf_d *d)
1466 {
1467 if (d->bd_hbuf_read != 0) {
1468 panic("resetting buffers during read");
1469 }
1470
1471 if (d->bd_hbuf) {
1472 /* Free the hold buffer. */
1473 d->bd_fbuf = d->bd_hbuf;
1474 d->bd_hbuf = NULL;
1475 }
1476 d->bd_slen = 0;
1477 d->bd_hlen = 0;
1478 d->bd_scnt = 0;
1479 d->bd_hcnt = 0;
1480 d->bd_rcount = 0;
1481 d->bd_dcount = 0;
1482 }
1483
1484 static struct bpf_d *
1485 bpf_get_device_from_uuid(uuid_t uuid)
1486 {
1487 unsigned int i;
1488
1489 for (i = 0; i < nbpfilter; i++) {
1490 struct bpf_d *d = bpf_dtab[i];
1491
1492 if (d == NULL || d == BPF_DEV_RESERVED ||
1493 (d->bd_flags & BPF_CLOSING) != 0) {
1494 continue;
1495 }
1496 if (uuid_compare(uuid, d->bd_uuid) == 0) {
1497 return d;
1498 }
1499 }
1500
1501 return NULL;
1502 }
1503
1504 /*
1505  * The BIOCSETUP command "atomically" attaches to the interface and
1506  * copies the buffers from another bpf descriptor. This minimizes the
1507  * risk of missing packets because it is done while holding the BPF
1508  * global lock; a usage sketch follows bpf_setup() below.
1509 */
1510 static int
1511 bpf_setup(struct bpf_d *d_to, uuid_t uuid_from, ifnet_t ifp)
1512 {
1513 struct bpf_d *d_from;
1514 int error = 0;
1515
1516 LCK_MTX_ASSERT(bpf_mlock, LCK_MTX_ASSERT_OWNED);
1517
1518 /*
1519 * Sanity checks
1520 */
1521 d_from = bpf_get_device_from_uuid(uuid_from);
1522 if (d_from == NULL) {
1523 error = ENOENT;
1524 os_log_info(OS_LOG_DEFAULT,
1525 "%s: uuids not found error %d",
1526 __func__, error);
1527 return error;
1528 }
1529 if (d_from->bd_opened_by != d_to->bd_opened_by) {
1530 error = EACCES;
1531 os_log_info(OS_LOG_DEFAULT,
1532 "%s: processes not matching error %d",
1533 __func__, error);
1534 return error;
1535 }
1536
1537 /*
1538 * Prevent any read while copying
1539 */
1540 while (d_to->bd_hbuf_read != 0) {
1541 msleep((caddr_t)d_to, bpf_mlock, PRINET, __func__, NULL);
1542 }
1543 d_to->bd_hbuf_read = 1;
1544
1545 while (d_from->bd_hbuf_read != 0) {
1546 msleep((caddr_t)d_from, bpf_mlock, PRINET, __func__, NULL);
1547 }
1548 d_from->bd_hbuf_read = 1;
1549
1550 /*
1551 * Verify the devices have not been closed
1552 */
1553 if (d_to->bd_flags & BPF_CLOSING) {
1554 error = ENXIO;
1555 os_log_info(OS_LOG_DEFAULT,
1556 "%s: d_to is closing error %d",
1557 __func__, error);
1558 goto done;
1559 }
1560 if (d_from->bd_flags & BPF_CLOSING) {
1561 error = ENXIO;
1562 os_log_info(OS_LOG_DEFAULT,
1563 "%s: d_from is closing error %d",
1564 __func__, error);
1565 goto done;
1566 }
1567
1568 /*
1569 * For now require the same buffer size
1570 */
1571 if (d_from->bd_bufsize != d_to->bd_bufsize) {
1572 error = EINVAL;
1573 os_log_info(OS_LOG_DEFAULT,
1574 "%s: bufsizes not matching error %d",
1575 __func__, error);
1576 goto done;
1577 }
1578
1579 /*
1580 * Attach to the interface
1581 */
1582 error = bpf_setif(d_to, ifp, false, true);
1583 if (error != 0) {
1584 os_log_info(OS_LOG_DEFAULT,
1585 "%s: bpf_setif() failed error %d",
1586 __func__, error);
1587 goto done;
1588 }
1589
1590 /*
1591 * Make sure the buffers are setup as expected by bpf_setif()
1592 */
1593 ASSERT(d_to->bd_hbuf == NULL);
1594 ASSERT(d_to->bd_sbuf != NULL);
1595 ASSERT(d_to->bd_fbuf != NULL);
1596
1597 /*
1598 * Copy the buffers and update the pointers and counts
1599 */
1600 memcpy(d_to->bd_sbuf, d_from->bd_sbuf, d_from->bd_slen);
1601 d_to->bd_slen = d_from->bd_slen;
1602 d_to->bd_scnt = d_from->bd_scnt;
1603
1604 if (d_from->bd_hbuf != NULL) {
1605 d_to->bd_hbuf = d_to->bd_fbuf;
1606 d_to->bd_fbuf = NULL;
1607 memcpy(d_to->bd_hbuf, d_from->bd_hbuf, d_from->bd_hlen);
1608 }
1609 d_to->bd_hlen = d_from->bd_hlen;
1610 d_to->bd_hcnt = d_from->bd_hcnt;
1611
1612 if (bpf_debug > 0) {
1613 os_log_info(OS_LOG_DEFAULT,
1614 "%s: done slen %u scnt %u hlen %u hcnt %u",
1615 __func__, d_to->bd_slen, d_to->bd_scnt,
1616 d_to->bd_hlen, d_to->bd_hcnt);
1617 }
1618 done:
1619 d_from->bd_hbuf_read = 0;
1620 wakeup((caddr_t)d_from);
1621
1622 d_to->bd_hbuf_read = 0;
1623 wakeup((caddr_t)d_to);
1624
1625 return error;
1626 }
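/*
 * Userspace sketch, for illustration only, of the BIOCGETUUID/BIOCSETUP
 * pair handled above.  Both descriptors must be opened by the same process
 * and use the same buffer size; "en0" is an arbitrary example interface.
 *
 *	struct bpf_setup_args bsa;
 *
 *	ioctl(fd_old, BIOCGETUUID, &bsa.bsa_uuid);
 *	strlcpy(bsa.bsa_ifname, "en0", sizeof(bsa.bsa_ifname));
 *	ioctl(fd_new, BIOCSETUP, &bsa);	// attach fd_new and copy fd_old's buffers
 */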
1627
1628 /*
1629 * FIONREAD Check for read packet available.
1630 * SIOCGIFADDR Get interface address - convenient hook to driver.
1631 * BIOCGBLEN Get buffer len [for read()].
1632 * BIOCSETF Set ethernet read filter.
1633 * BIOCFLUSH Flush read packet buffer.
1634 * BIOCPROMISC Put interface into promiscuous mode.
1635 * BIOCGDLT Get link layer type.
1636 * BIOCGETIF Get interface name.
1637 * BIOCSETIF Set interface.
1638 * BIOCSRTIMEOUT Set read timeout.
1639 * BIOCGRTIMEOUT Get read timeout.
1640 * BIOCGSTATS Get packet stats.
1641 * BIOCIMMEDIATE Set immediate mode.
1642 * BIOCVERSION Get filter language version.
1643 * BIOCGHDRCMPLT Get "header already complete" flag
1644 * BIOCSHDRCMPLT Set "header already complete" flag
1645 * BIOCGSEESENT Get "see packets sent" flag
1646 * BIOCSSEESENT Set "see packets sent" flag
1647 * BIOCSETTC Set traffic class.
1648 * BIOCGETTC Get traffic class.
1649 * BIOCSEXTHDR Set "extended header" flag
1650 * BIOCSHEADDROP Drop head of the buffer if user is not reading
1651 * BIOCGHEADDROP Get "head-drop" flag
1652 */
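/*
 * Userspace sketch, for illustration only, of a typical capture setup built
 * from the ioctls listed above, followed by the read loop.  Note that
 * bpfread() requires the read length to equal the buffer size reported by
 * BIOCGBLEN; "en0" is an arbitrary example interface.
 *
 *	struct ifreq ifr;
 *	u_int bufsize, immediate = 1;
 *
 *	strlcpy(ifr.ifr_name, "en0", sizeof(ifr.ifr_name));
 *	ioctl(fd, BIOCSETIF, &ifr);
 *	ioctl(fd, BIOCIMMEDIATE, &immediate);
 *	ioctl(fd, BIOCGBLEN, &bufsize);
 *
 *	char *buf = malloc(bufsize);
 *	ssize_t n = read(fd, buf, bufsize);
 *	for (char *p = buf; p < buf + n; ) {
 *		struct bpf_hdr *bh = (struct bpf_hdr *)(void *)p;
 *		// packet data is at p + bh->bh_hdrlen, bh->bh_caplen bytes long
 *		p += BPF_WORDALIGN(bh->bh_hdrlen + bh->bh_caplen);
 *	}
 */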
1653 /* ARGSUSED */
1654 int
1655 bpfioctl(dev_t dev, u_long cmd, caddr_t addr, __unused int flags,
1656 struct proc *p)
1657 {
1658 struct bpf_d *d;
1659 int error = 0;
1660 u_int int_arg;
1661 struct ifreq ifr;
1662
1663 lck_mtx_lock(bpf_mlock);
1664
1665 d = bpf_dtab[minor(dev)];
1666 if (d == NULL || d == BPF_DEV_RESERVED ||
1667 (d->bd_flags & BPF_CLOSING) != 0) {
1668 lck_mtx_unlock(bpf_mlock);
1669 return ENXIO;
1670 }
1671
1672 bpf_acquire_d(d);
1673
1674 if (d->bd_state == BPF_WAITING) {
1675 bpf_stop_timer(d);
1676 }
1677 d->bd_state = BPF_IDLE;
1678
1679 switch (cmd) {
1680 default:
1681 error = EINVAL;
1682 break;
1683
1684 /*
1685 * Check for read packet available.
1686 */
1687 case FIONREAD: /* int */
1688 {
1689 int n;
1690
1691 n = d->bd_slen;
1692 if (d->bd_hbuf && d->bd_hbuf_read == 0) {
1693 n += d->bd_hlen;
1694 }
1695
1696 bcopy(&n, addr, sizeof(n));
1697 break;
1698 }
1699
1700 case SIOCGIFADDR: /* struct ifreq */
1701 {
1702 struct ifnet *ifp;
1703
1704 if (d->bd_bif == 0) {
1705 error = EINVAL;
1706 } else {
1707 ifp = d->bd_bif->bif_ifp;
1708 error = ifnet_ioctl(ifp, 0, cmd, addr);
1709 }
1710 break;
1711 }
1712
1713 /*
1714 * Get buffer len [for read()].
1715 */
1716 case BIOCGBLEN: /* u_int */
1717 bcopy(&d->bd_bufsize, addr, sizeof(u_int));
1718 break;
1719
1720 /*
1721 * Set buffer length.
1722 */
1723 case BIOCSBLEN: { /* u_int */
1724 u_int size;
1725 unsigned int maxbufsize = bpf_maxbufsize;
1726
1727 /*
1728  * Allow a larger buffer in head drop mode, with the
1729 * assumption the reading process may be low priority but
1730 * is interested in the most recent traffic
1731 */
1732 if (d->bd_headdrop != 0) {
1733 maxbufsize = 2 * bpf_maxbufsize;
1734 }
1735
1736 if (d->bd_bif != 0 || (d->bd_flags & BPF_DETACHING)) {
1737 /*
1738 * Interface already attached, unable to change buffers
1739 */
1740 error = EINVAL;
1741 break;
1742 }
1743 bcopy(addr, &size, sizeof(size));
1744
1745 if (size > maxbufsize) {
1746 d->bd_bufsize = maxbufsize;
1747
1748 os_log_info(OS_LOG_DEFAULT,
1749 "%s bufsize capped to %u from %u",
1750 __func__, d->bd_bufsize, size);
1751 } else if (size < BPF_MINBUFSIZE) {
1752 d->bd_bufsize = BPF_MINBUFSIZE;
1753
1754 os_log_info(OS_LOG_DEFAULT,
1755 "%s bufsize bumped to %u from %u",
1756 __func__, d->bd_bufsize, size);
1757 } else {
1758 d->bd_bufsize = size;
1759 }
1760
1761 /* It's a read/write ioctl */
1762 bcopy(&d->bd_bufsize, addr, sizeof(u_int));
1763 break;
1764 }
1765 /*
1766 * Set link layer read filter.
1767 */
1768 case BIOCSETF32:
1769 case BIOCSETFNR32: { /* struct bpf_program32 */
1770 struct bpf_program32 prg32;
1771
1772 bcopy(addr, &prg32, sizeof(prg32));
1773 error = bpf_setf(d, prg32.bf_len,
1774 CAST_USER_ADDR_T(prg32.bf_insns), cmd);
1775 break;
1776 }
1777
1778 case BIOCSETF64:
1779 case BIOCSETFNR64: { /* struct bpf_program64 */
1780 struct bpf_program64 prg64;
1781
1782 bcopy(addr, &prg64, sizeof(prg64));
1783 error = bpf_setf(d, prg64.bf_len, prg64.bf_insns, cmd);
1784 break;
1785 }
1786
1787 /*
1788 * Flush read packet buffer.
1789 */
1790 case BIOCFLUSH:
1791 while (d->bd_hbuf_read != 0) {
1792 msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading",
1793 NULL);
1794 }
1795 if ((d->bd_flags & BPF_CLOSING) != 0) {
1796 error = ENXIO;
1797 break;
1798 }
1799 reset_d(d);
1800 break;
1801
1802 /*
1803 * Put interface into promiscuous mode.
1804 */
1805 case BIOCPROMISC:
1806 if (d->bd_bif == 0) {
1807 /*
1808 * No interface attached yet.
1809 */
1810 error = EINVAL;
1811 break;
1812 }
1813 if (d->bd_promisc == 0) {
1814 lck_mtx_unlock(bpf_mlock);
1815 error = ifnet_set_promiscuous(d->bd_bif->bif_ifp, 1);
1816 lck_mtx_lock(bpf_mlock);
1817 if (error == 0) {
1818 d->bd_promisc = 1;
1819 }
1820 }
1821 break;
1822
1823 /*
1824 * Get device parameters.
1825 */
1826 case BIOCGDLT: /* u_int */
1827 if (d->bd_bif == 0) {
1828 error = EINVAL;
1829 } else {
1830 bcopy(&d->bd_bif->bif_dlt, addr, sizeof(u_int));
1831 }
1832 break;
1833
1834 /*
1835 * Get a list of supported data link types.
1836 */
1837 case BIOCGDLTLIST: /* struct bpf_dltlist */
1838 if (d->bd_bif == NULL) {
1839 error = EINVAL;
1840 } else {
1841 error = bpf_getdltlist(d, addr, p);
1842 }
1843 break;
1844
1845 /*
1846 * Set data link type.
1847 */
1848 case BIOCSDLT: /* u_int */
1849 if (d->bd_bif == NULL) {
1850 error = EINVAL;
1851 } else {
1852 u_int dlt;
1853
1854 bcopy(addr, &dlt, sizeof(dlt));
1855
1856 if (dlt == DLT_PKTAP &&
1857 !(d->bd_flags & BPF_WANT_PKTAP)) {
1858 dlt = DLT_RAW;
1859 }
1860 error = bpf_setdlt(d, dlt);
1861 }
1862 break;
1863
1864 /*
1865 * Get interface name.
1866 */
1867 case BIOCGETIF: /* struct ifreq */
1868 if (d->bd_bif == 0) {
1869 error = EINVAL;
1870 } else {
1871 struct ifnet *const ifp = d->bd_bif->bif_ifp;
1872
1873 snprintf(((struct ifreq *)(void *)addr)->ifr_name,
1874 sizeof(ifr.ifr_name), "%s", if_name(ifp));
1875 }
1876 break;
1877
1878 /*
1879 * Set interface.
1880 */
1881 case BIOCSETIF: { /* struct ifreq */
1882 ifnet_t ifp;
1883
1884 bcopy(addr, &ifr, sizeof(ifr));
1885 ifr.ifr_name[IFNAMSIZ - 1] = '\0';
1886 ifp = ifunit(ifr.ifr_name);
1887 if (ifp == NULL) {
1888 error = ENXIO;
1889 } else {
1890 error = bpf_setif(d, ifp, true, false);
1891 }
1892 break;
1893 }
1894
1895 /*
1896 * Set read timeout.
1897 */
1898 case BIOCSRTIMEOUT32: { /* struct user32_timeval */
1899 struct user32_timeval _tv;
1900 struct timeval tv;
1901
1902 bcopy(addr, &_tv, sizeof(_tv));
1903 tv.tv_sec = _tv.tv_sec;
1904 tv.tv_usec = _tv.tv_usec;
1905
1906 /*
1907 * Subtract 1 tick from tvtohz() since this isn't
1908 * a one-shot timer.
1909 */
1910 if ((error = itimerfix(&tv)) == 0) {
1911 d->bd_rtout = tvtohz(&tv) - 1;
1912 }
1913 break;
1914 }
1915
1916 case BIOCSRTIMEOUT64: { /* struct user64_timeval */
1917 struct user64_timeval _tv;
1918 struct timeval tv;
1919
1920 bcopy(addr, &_tv, sizeof(_tv));
1921 tv.tv_sec = _tv.tv_sec;
1922 tv.tv_usec = _tv.tv_usec;
1923
1924 /*
1925 * Subtract 1 tick from tvtohz() since this isn't
1926 * a one-shot timer.
1927 */
1928 if ((error = itimerfix(&tv)) == 0) {
1929 d->bd_rtout = tvtohz(&tv) - 1;
1930 }
1931 break;
1932 }
1933
1934 /*
1935 * Get read timeout.
1936 */
1937 case BIOCGRTIMEOUT32: { /* struct user32_timeval */
1938 struct user32_timeval tv;
1939
1940 bzero(&tv, sizeof(tv));
1941 tv.tv_sec = d->bd_rtout / hz;
1942 tv.tv_usec = (d->bd_rtout % hz) * tick;
1943 bcopy(&tv, addr, sizeof(tv));
1944 break;
1945 }
1946
1947 case BIOCGRTIMEOUT64: { /* struct user64_timeval */
1948 struct user64_timeval tv;
1949
1950 bzero(&tv, sizeof(tv));
1951 tv.tv_sec = d->bd_rtout / hz;
1952 tv.tv_usec = (d->bd_rtout % hz) * tick;
1953 bcopy(&tv, addr, sizeof(tv));
1954 break;
1955 }
1956
1957 /*
1958 * Get packet stats.
1959 */
1960 case BIOCGSTATS: { /* struct bpf_stat */
1961 struct bpf_stat bs;
1962
1963 bzero(&bs, sizeof(bs));
1964 bs.bs_recv = d->bd_rcount;
1965 bs.bs_drop = d->bd_dcount;
1966 bcopy(&bs, addr, sizeof(bs));
1967 break;
1968 }
1969
1970 /*
1971 * Set immediate mode.
1972 */
1973 case BIOCIMMEDIATE: /* u_int */
1974 d->bd_immediate = *(u_int *)(void *)addr;
1975 break;
1976
1977 case BIOCVERSION: { /* struct bpf_version */
1978 struct bpf_version bv;
1979
1980 bzero(&bv, sizeof(bv));
1981 bv.bv_major = BPF_MAJOR_VERSION;
1982 bv.bv_minor = BPF_MINOR_VERSION;
1983 bcopy(&bv, addr, sizeof(bv));
1984 break;
1985 }
1986
1987 /*
1988 * Get "header already complete" flag
1989 */
1990 case BIOCGHDRCMPLT: /* u_int */
1991 bcopy(&d->bd_hdrcmplt, addr, sizeof(u_int));
1992 break;
1993
1994 /*
1995 * Set "header already complete" flag
1996 */
1997 case BIOCSHDRCMPLT: /* u_int */
1998 bcopy(addr, &int_arg, sizeof(int_arg));
1999 d->bd_hdrcmplt = int_arg ? 1 : 0;
2000 break;
2001
2002 /*
2003 * Get "see sent packets" flag
2004 */
2005 case BIOCGSEESENT: /* u_int */
2006 bcopy(&d->bd_seesent, addr, sizeof(u_int));
2007 break;
2008
2009 /*
2010 * Set "see sent packets" flag
2011 */
2012 case BIOCSSEESENT: /* u_int */
2013 bcopy(addr, &d->bd_seesent, sizeof(u_int));
2014 break;
2015
2016 /*
2017 * Set traffic service class
2018 */
2019 case BIOCSETTC: { /* int */
2020 int tc;
2021
2022 bcopy(addr, &tc, sizeof(int));
2023 error = bpf_set_traffic_class(d, tc);
2024 break;
2025 }
2026
2027 /*
2028 * Get traffic service class
2029 */
2030 case BIOCGETTC: /* int */
2031 bcopy(&d->bd_traffic_class, addr, sizeof(int));
2032 break;
2033
2034 case FIONBIO: /* Non-blocking I/O; int */
2035 break;
2036
2037 case FIOASYNC: /* Send signal on receive packets; int */
2038 bcopy(addr, &d->bd_async, sizeof(int));
2039 break;
2040 #ifndef __APPLE__
2041 case FIOSETOWN:
2042 error = fsetown(*(int *)addr, &d->bd_sigio);
2043 break;
2044
2045 case FIOGETOWN:
2046 *(int *)addr = fgetown(d->bd_sigio);
2047 break;
2048
2049 /* This is deprecated, FIOSETOWN should be used instead. */
2050 case TIOCSPGRP:
2051 error = fsetown(-(*(int *)addr), &d->bd_sigio);
2052 break;
2053
2054 /* This is deprecated, FIOGETOWN should be used instead. */
2055 case TIOCGPGRP:
2056 *(int *)addr = -fgetown(d->bd_sigio);
2057 break;
2058 #endif
2059 case BIOCSRSIG: { /* Set receive signal; u_int */
2060 u_int sig;
2061
2062 bcopy(addr, &sig, sizeof(u_int));
2063
2064 if (sig >= NSIG) {
2065 error = EINVAL;
2066 } else {
2067 d->bd_sig = sig;
2068 }
2069 break;
2070 }
2071 case BIOCGRSIG: /* u_int */
2072 bcopy(&d->bd_sig, addr, sizeof(u_int));
2073 break;
2074 #ifdef __APPLE__
2075 case BIOCSEXTHDR: /* u_int */
2076 bcopy(addr, &int_arg, sizeof(int_arg));
2077 if (int_arg) {
2078 d->bd_flags |= BPF_EXTENDED_HDR;
2079 } else {
2080 d->bd_flags &= ~BPF_EXTENDED_HDR;
2081 }
2082 break;
2083
2084 case BIOCGIFATTACHCOUNT: { /* struct ifreq */
2085 ifnet_t ifp;
2086 struct bpf_if *bp;
2087
2088 bcopy(addr, &ifr, sizeof(ifr));
2089 ifr.ifr_name[IFNAMSIZ - 1] = '\0';
2090 ifp = ifunit(ifr.ifr_name);
2091 if (ifp == NULL) {
2092 error = ENXIO;
2093 break;
2094 }
2095 ifr.ifr_intval = 0;
2096 for (bp = bpf_iflist; bp != 0; bp = bp->bif_next) {
2097 struct bpf_d *bpf_d;
2098
2099 if (bp->bif_ifp == NULL || bp->bif_ifp != ifp) {
2100 continue;
2101 }
2102 for (bpf_d = bp->bif_dlist; bpf_d;
2103 bpf_d = bpf_d->bd_next) {
2104 ifr.ifr_intval += 1;
2105 }
2106 }
2107 bcopy(&ifr, addr, sizeof(ifr));
2108 break;
2109 }
2110 case BIOCGWANTPKTAP: /* u_int */
2111 int_arg = d->bd_flags & BPF_WANT_PKTAP ? 1 : 0;
2112 bcopy(&int_arg, addr, sizeof(int_arg));
2113 break;
2114
2115 case BIOCSWANTPKTAP: /* u_int */
2116 bcopy(addr, &int_arg, sizeof(int_arg));
2117 if (int_arg) {
2118 d->bd_flags |= BPF_WANT_PKTAP;
2119 } else {
2120 d->bd_flags &= ~BPF_WANT_PKTAP;
2121 }
2122 break;
2123 #endif
2124
2125 case BIOCSHEADDROP:
2126 bcopy(addr, &int_arg, sizeof(int_arg));
2127 d->bd_headdrop = int_arg ? 1 : 0;
2128 break;
2129
2130 case BIOCGHEADDROP:
2131 bcopy(&d->bd_headdrop, addr, sizeof(int));
2132 break;
2133
2134 case BIOCSTRUNCATE:
2135 bcopy(addr, &int_arg, sizeof(int_arg));
2136 if (int_arg) {
2137 d->bd_flags |= BPF_TRUNCATE;
2138 } else {
2139 d->bd_flags &= ~BPF_TRUNCATE;
2140 }
2141 break;
2142
2143 case BIOCGETUUID:
2144 bcopy(&d->bd_uuid, addr, sizeof(uuid_t));
2145 break;
2146
2147 case BIOCSETUP: {
2148 struct bpf_setup_args bsa;
2149 ifnet_t ifp;
2150
2151 bcopy(addr, &bsa, sizeof(struct bpf_setup_args));
2152 bsa.bsa_ifname[IFNAMSIZ - 1] = 0;
2153 ifp = ifunit(bsa.bsa_ifname);
2154 if (ifp == NULL) {
2155 error = ENXIO;
2156 os_log_info(OS_LOG_DEFAULT,
2157 "%s: ifnet not found for %s error %d",
2158 __func__, bsa.bsa_ifname, error);
2159 break;
2160 }
2161
2162 error = bpf_setup(d, bsa.bsa_uuid, ifp);
2163 break;
2164 }
2165 case BIOCSPKTHDRV2:
2166 bcopy(addr, &int_arg, sizeof(int_arg));
2167 if (int_arg != 0) {
2168 d->bd_flags |= BPF_PKTHDRV2;
2169 } else {
2170 d->bd_flags &= ~BPF_PKTHDRV2;
2171 }
2172 break;
2173
2174 case BIOCGPKTHDRV2:
2175 int_arg = d->bd_flags & BPF_PKTHDRV2 ? 1 : 0;
2176 bcopy(&int_arg, addr, sizeof(int));
2177 break;
2178 }
2179
2180 bpf_release_d(d);
2181 lck_mtx_unlock(bpf_mlock);
2182
2183 return error;
2184 }
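
/*
 * Illustrative userspace sketch (not part of this file's build): how a
 * process might exercise a couple of the flag ioctls handled above, on a
 * hypothetical descriptor "fd" obtained by opening a /dev/bpfN node.
 *
 *	u_int see_sent = 0;	// do not capture packets sent by this host
 *	if (ioctl(fd, BIOCSSEESENT, &see_sent) == -1)
 *		warn("BIOCSSEESENT");
 *
 *	int head_drop = 1;	// drop the hold buffer (oldest packets) when both buffers are full
 *	if (ioctl(fd, BIOCSHEADDROP, &head_drop) == -1)
 *		warn("BIOCSHEADDROP");
 */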
2185
2186 /*
2187 * Set d's packet filter program to the given instructions. If this file
2188 * already has a filter, free it and replace it. Returns EINVAL for bogus requests.
2189 */
2190 static int
2191 bpf_setf(struct bpf_d *d, u_int bf_len, user_addr_t bf_insns,
2192 u_long cmd)
2193 {
2194 struct bpf_insn *fcode, *old;
2195 u_int flen, size;
2196
2197 while (d->bd_hbuf_read != 0) {
2198 msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading", NULL);
2199 }
2200
2201 if ((d->bd_flags & BPF_CLOSING) != 0) {
2202 return ENXIO;
2203 }
2204
2205 old = d->bd_filter;
2206 if (bf_insns == USER_ADDR_NULL) {
2207 if (bf_len != 0) {
2208 return EINVAL;
2209 }
2210 d->bd_filter = NULL;
2211 reset_d(d);
2212 if (old != 0) {
2213 FREE(old, M_DEVBUF);
2214 }
2215 return 0;
2216 }
2217 flen = bf_len;
2218 if (flen > BPF_MAXINSNS) {
2219 return EINVAL;
2220 }
2221
2222 size = flen * sizeof(struct bpf_insn);
2223 fcode = (struct bpf_insn *) _MALLOC(size, M_DEVBUF, M_WAIT);
2224 #ifdef __APPLE__
2225 if (fcode == NULL) {
2226 return ENOBUFS;
2227 }
2228 #endif
2229 if (copyin(bf_insns, (caddr_t)fcode, size) == 0 &&
2230 bpf_validate(fcode, (int)flen)) {
2231 d->bd_filter = fcode;
2232
2233 if (cmd == BIOCSETF32 || cmd == BIOCSETF64) {
2234 reset_d(d);
2235 }
2236
2237 if (old != 0) {
2238 FREE(old, M_DEVBUF);
2239 }
2240
2241 return 0;
2242 }
2243 FREE(fcode, M_DEVBUF);
2244 return EINVAL;
2245 }
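
/*
 * Illustrative userspace sketch (not part of this file's build): installing
 * a trivial "accept everything" program via BIOCSETF, the ioctl that ends
 * up in bpf_setf() above. Assumes a descriptor "fd" already opened on a
 * /dev/bpfN node.
 *
 *	struct bpf_insn insns[] = {
 *		BPF_STMT(BPF_RET + BPF_K, (u_int)-1),	// accept the whole packet
 *	};
 *	struct bpf_program prog = {
 *		.bf_len = sizeof(insns) / sizeof(insns[0]),
 *		.bf_insns = insns,
 *	};
 *	if (ioctl(fd, BIOCSETF, &prog) == -1)
 *		err(1, "BIOCSETF");
 */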
2246
2247 /*
2248 * Detach a file from its current interface (if attached at all) and attach
2249 * to the interface indicated by theywant.
2250 * Return an errno or 0.
2251 */
2252 static int
2253 bpf_setif(struct bpf_d *d, ifnet_t theywant, bool do_reset, bool has_hbuf_read)
2254 {
2255 struct bpf_if *bp;
2256 int error;
2257
2258 while (d->bd_hbuf_read != 0 && !has_hbuf_read) {
2259 msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading", NULL);
2260 }
2261
2262 if ((d->bd_flags & BPF_CLOSING) != 0) {
2263 return ENXIO;
2264 }
2265
2266 /*
2267 * Look through attached interfaces for the named one.
2268 */
2269 for (bp = bpf_iflist; bp != 0; bp = bp->bif_next) {
2270 struct ifnet *ifp = bp->bif_ifp;
2271
2272 if (ifp == 0 || ifp != theywant) {
2273 continue;
2274 }
2275 /*
2276 * Do not use DLT_PKTAP unless requested explicitly
2277 */
2278 if (bp->bif_dlt == DLT_PKTAP && !(d->bd_flags & BPF_WANT_PKTAP)) {
2279 continue;
2280 }
2281 /*
2282 * Skip the coprocessor interface
2283 */
2284 if (!intcoproc_unrestricted && IFNET_IS_INTCOPROC(ifp)) {
2285 continue;
2286 }
2287 /*
2288 * We found the requested interface.
2289 * Allocate the packet buffers.
2290 */
2291 error = bpf_allocbufs(d);
2292 if (error != 0) {
2293 return error;
2294 }
2295 /*
2296 * Detach if attached to something else.
2297 */
2298 if (bp != d->bd_bif) {
2299 if (d->bd_bif != NULL) {
2300 if (bpf_detachd(d, 0) != 0) {
2301 return ENXIO;
2302 }
2303 }
2304 if (bpf_attachd(d, bp) != 0) {
2305 return ENXIO;
2306 }
2307 }
2308 if (do_reset) {
2309 reset_d(d);
2310 }
2311 return 0;
2312 }
2313 /* Not found. */
2314 return ENXIO;
2315 }
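
/*
 * Illustrative userspace sketch (not part of this file's build): binding a
 * descriptor to an interface by name with BIOCSETIF; the ioctl handler
 * resolves the name and calls bpf_setif() above. "en0" is only an example.
 *
 *	struct ifreq ifr;
 *	memset(&ifr, 0, sizeof(ifr));
 *	strlcpy(ifr.ifr_name, "en0", sizeof(ifr.ifr_name));
 *	if (ioctl(fd, BIOCSETIF, &ifr) == -1)
 *		err(1, "BIOCSETIF");
 */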
2316
2317 /*
2318 * Get a list of the available data link types of the interface.
2319 */
2320 static int
2321 bpf_getdltlist(struct bpf_d *d, caddr_t addr, struct proc *p)
2322 {
2323 u_int n;
2324 int error;
2325 struct ifnet *ifp;
2326 struct bpf_if *bp;
2327 user_addr_t dlist;
2328 struct bpf_dltlist bfl;
2329
2330 bcopy(addr, &bfl, sizeof(bfl));
2331 if (proc_is64bit(p)) {
2332 dlist = (user_addr_t)bfl.bfl_u.bflu_pad;
2333 } else {
2334 dlist = CAST_USER_ADDR_T(bfl.bfl_u.bflu_list);
2335 }
2336
2337 ifp = d->bd_bif->bif_ifp;
2338 n = 0;
2339 error = 0;
2340
2341 for (bp = bpf_iflist; bp; bp = bp->bif_next) {
2342 if (bp->bif_ifp != ifp) {
2343 continue;
2344 }
2345 /*
2346 * Do not use DLT_PKTAP unless requested explicitly
2347 */
2348 if (bp->bif_dlt == DLT_PKTAP && !(d->bd_flags & BPF_WANT_PKTAP)) {
2349 continue;
2350 }
2351 if (dlist != USER_ADDR_NULL) {
2352 if (n >= bfl.bfl_len) {
2353 return ENOMEM;
2354 }
2355 error = copyout(&bp->bif_dlt, dlist,
2356 sizeof(bp->bif_dlt));
2357 if (error != 0) {
2358 break;
2359 }
2360 dlist += sizeof(bp->bif_dlt);
2361 }
2362 n++;
2363 }
2364 bfl.bfl_len = n;
2365 bcopy(&bfl, addr, sizeof(bfl));
2366
2367 return error;
2368 }
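
/*
 * Illustrative userspace sketch (not part of this file's build): retrieving
 * the list built above with BIOCGDLTLIST. A first call with bfl_list left
 * NULL only reports the count in bfl_len, so the caller can size the array.
 *
 *	struct bpf_dltlist bfl;
 *	memset(&bfl, 0, sizeof(bfl));
 *	if (ioctl(fd, BIOCGDLTLIST, &bfl) == -1)
 *		err(1, "BIOCGDLTLIST");
 *	bfl.bfl_list = calloc(bfl.bfl_len, sizeof(u_int));
 *	if (ioctl(fd, BIOCGDLTLIST, &bfl) == -1)
 *		err(1, "BIOCGDLTLIST");
 *	// bfl.bfl_list[0 .. bfl.bfl_len - 1] now holds the DLT_* values
 */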
2369
2370 /*
2371 * Set the data link type of a BPF instance.
2372 */
2373 static int
2374 bpf_setdlt(struct bpf_d *d, uint32_t dlt)
2375 {
2376 int error, opromisc;
2377 struct ifnet *ifp;
2378 struct bpf_if *bp;
2379
2380 if (d->bd_bif->bif_dlt == dlt) {
2381 return 0;
2382 }
2383
2384 while (d->bd_hbuf_read != 0) {
2385 msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading", NULL);
2386 }
2387
2388 if ((d->bd_flags & BPF_CLOSING) != 0) {
2389 return ENXIO;
2390 }
2391
2392 ifp = d->bd_bif->bif_ifp;
2393 for (bp = bpf_iflist; bp; bp = bp->bif_next) {
2394 if (bp->bif_ifp == ifp && bp->bif_dlt == dlt) {
2395 /*
2396 * Do not use DLT_PKTAP unless requested explicitly
2397 */
2398 if (bp->bif_dlt == DLT_PKTAP &&
2399 !(d->bd_flags & BPF_WANT_PKTAP)) {
2400 continue;
2401 }
2402 break;
2403 }
2404 }
2405 if (bp != NULL) {
2406 opromisc = d->bd_promisc;
2407 if (bpf_detachd(d, 0) != 0) {
2408 return ENXIO;
2409 }
2410 error = bpf_attachd(d, bp);
2411 if (error) {
2412 printf("bpf_setdlt: bpf_attachd %s%d failed (%d)\n",
2413 ifnet_name(bp->bif_ifp), ifnet_unit(bp->bif_ifp),
2414 error);
2415 return error;
2416 }
2417 reset_d(d);
2418 if (opromisc) {
2419 lck_mtx_unlock(bpf_mlock);
2420 error = ifnet_set_promiscuous(bp->bif_ifp, 1);
2421 lck_mtx_lock(bpf_mlock);
2422 if (error) {
2423 printf("%s: ifpromisc %s%d failed (%d)\n",
2424 __func__, ifnet_name(bp->bif_ifp),
2425 ifnet_unit(bp->bif_ifp), error);
2426 } else {
2427 d->bd_promisc = 1;
2428 }
2429 }
2430 }
2431 return bp == NULL ? EINVAL : 0;
2432 }
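
/*
 * Illustrative userspace sketch (not part of this file's build): switching
 * the descriptor to one of the DLTs reported by BIOCGDLTLIST, which is
 * handled by bpf_setdlt() above.
 *
 *	u_int dlt = DLT_RAW;	// example value; must be one the interface supports
 *	if (ioctl(fd, BIOCSDLT, &dlt) == -1)
 *		err(1, "BIOCSDLT");
 */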
2433
2434 static int
2435 bpf_set_traffic_class(struct bpf_d *d, int tc)
2436 {
2437 int error = 0;
2438
2439 if (!SO_VALID_TC(tc)) {
2440 error = EINVAL;
2441 } else {
2442 d->bd_traffic_class = tc;
2443 }
2444
2445 return error;
2446 }
2447
2448 static void
2449 bpf_set_packet_service_class(struct mbuf *m, int tc)
2450 {
2451 if (!(m->m_flags & M_PKTHDR)) {
2452 return;
2453 }
2454
2455 VERIFY(SO_VALID_TC(tc));
2456 (void) m_set_service_class(m, so_tc2msc(tc));
2457 }
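
/*
 * Illustrative userspace sketch (not part of this file's build): selecting
 * the traffic service class applied to packets subsequently written to the
 * descriptor, validated by bpf_set_traffic_class() above. SO_TC_VO is just
 * an example; any class accepted by SO_VALID_TC() works.
 *
 *	int tc = SO_TC_VO;
 *	if (ioctl(fd, BIOCSETTC, &tc) == -1)
 *		err(1, "BIOCSETTC");
 */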
2458
2459 /*
2460 * Support for select()
2461 *
2462 * Return true iff the specific operation will not block indefinitely.
2463 * Otherwise, return false but make a note that a selwakeup() must be done.
2464 */
2465 int
2466 bpfselect(dev_t dev, int which, void * wql, struct proc *p)
2467 {
2468 struct bpf_d *d;
2469 int ret = 0;
2470
2471 lck_mtx_lock(bpf_mlock);
2472
2473 d = bpf_dtab[minor(dev)];
2474 if (d == NULL || d == BPF_DEV_RESERVED ||
2475 (d->bd_flags & BPF_CLOSING) != 0) {
2476 lck_mtx_unlock(bpf_mlock);
2477 return ENXIO;
2478 }
2479
2480 bpf_acquire_d(d);
2481
2482 if (d->bd_bif == NULL) {
2483 bpf_release_d(d);
2484 lck_mtx_unlock(bpf_mlock);
2485 return ENXIO;
2486 }
2487
2488 while (d->bd_hbuf_read != 0) {
2489 msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading", NULL);
2490 }
2491
2492 if ((d->bd_flags & BPF_CLOSING) != 0) {
2493 bpf_release_d(d);
2494 lck_mtx_unlock(bpf_mlock);
2495 return ENXIO;
2496 }
2497
2498 switch (which) {
2499 case FREAD:
2500 if (d->bd_hlen != 0 ||
2501 ((d->bd_immediate ||
2502 d->bd_state == BPF_TIMED_OUT) && d->bd_slen != 0)) {
2503 ret = 1; /* read has data to return */
2504 } else {
2505 /*
2506 * Read has no data to return.
2507 * Make the select wait, and start a timer if
2508 * necessary.
2509 */
2510 selrecord(p, &d->bd_sel, wql);
2511 bpf_start_timer(d);
2512 }
2513 break;
2514
2515 case FWRITE:
2516 /* can't determine whether a write would block */
2517 ret = 1;
2518 break;
2519 }
2520
2521 bpf_release_d(d);
2522 lck_mtx_unlock(bpf_mlock);
2523
2524 return ret;
2525 }
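
/*
 * Illustrative userspace sketch (not part of this file's build): waiting for
 * captured data with select(), which lands in bpfselect() above. With
 * immediate mode enabled the descriptor becomes readable as soon as the
 * store buffer holds any data.
 *
 *	u_int imm = 1;
 *	(void) ioctl(fd, BIOCIMMEDIATE, &imm);
 *
 *	fd_set rfds;
 *	FD_ZERO(&rfds);
 *	FD_SET(fd, &rfds);
 *	if (select(fd + 1, &rfds, NULL, NULL, NULL) > 0 && FD_ISSET(fd, &rfds)) {
 *		// a read() of the capture buffer will not block
 *	}
 */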
2526
2527 /*
2528 * Support for kevent() system call. Register EVFILT_READ filters and
2529 * reject all others.
2530 */
2531 int bpfkqfilter(dev_t dev, struct knote *kn);
2532 static void filt_bpfdetach(struct knote *);
2533 static int filt_bpfread(struct knote *, long);
2534 static int filt_bpftouch(struct knote *kn, struct kevent_qos_s *kev);
2535 static int filt_bpfprocess(struct knote *kn, struct kevent_qos_s *kev);
2536
2537 SECURITY_READ_ONLY_EARLY(struct filterops) bpfread_filtops = {
2538 .f_isfd = 1,
2539 .f_detach = filt_bpfdetach,
2540 .f_event = filt_bpfread,
2541 .f_touch = filt_bpftouch,
2542 .f_process = filt_bpfprocess,
2543 };
2544
2545 static int
2546 filt_bpfread_common(struct knote *kn, struct kevent_qos_s *kev, struct bpf_d *d)
2547 {
2548 int ready = 0;
2549 int64_t data = 0;
2550
2551 if (d->bd_immediate) {
2552 /*
2553 * If there's data in the hold buffer, it's the
2554 * amount of data a read will return.
2555 *
2556 * If there's no data in the hold buffer, but
2557 * there's data in the store buffer, a read will
2558 * immediately rotate the store buffer to the
2559 * hold buffer, so the amount of data in the store
2560 * buffer is the amount of data a read will
2561 * return.
2562 *
2563 * If there's no data in either buffer, we're not
2564 * ready to read.
2565 */
2566 data = (d->bd_hlen == 0 || d->bd_hbuf_read != 0 ?
2567 d->bd_slen : d->bd_hlen);
2568 int64_t lowwat = knote_low_watermark(kn);
2569 if (lowwat > d->bd_bufsize) {
2570 lowwat = d->bd_bufsize;
2571 }
2572 ready = (data >= lowwat);
2573 } else {
2574 /*
2575 * If there's data in the hold buffer, it's the
2576 * amount of data a read will return.
2577 *
2578 * If there's no data in the hold buffer, but
2579 * there's data in the store buffer, if the
2580 * timer has expired a read will immediately
2581 * rotate the store buffer to the hold buffer,
2582 * so the amount of data in the store buffer is
2583 * the amount of data a read will return.
2584 *
2585 * If there's no data in either buffer, or there's
2586 * no data in the hold buffer and the timer hasn't
2587 * expired, we're not ready to read.
2588 */
2589 data = ((d->bd_hlen == 0 || d->bd_hbuf_read != 0) &&
2590 d->bd_state == BPF_TIMED_OUT ? d->bd_slen : d->bd_hlen);
2591 ready = (data > 0);
2592 }
2593 if (!ready) {
2594 bpf_start_timer(d);
2595 } else if (kev) {
2596 knote_fill_kevent(kn, kev, data);
2597 }
2598
2599 return ready;
2600 }
2601
2602 int
2603 bpfkqfilter(dev_t dev, struct knote *kn)
2604 {
2605 struct bpf_d *d;
2606 int res;
2607
2608 /*
2609 * Is this device a bpf?
2610 */
2611 if (major(dev) != CDEV_MAJOR || kn->kn_filter != EVFILT_READ) {
2612 knote_set_error(kn, EINVAL);
2613 return 0;
2614 }
2615
2616 lck_mtx_lock(bpf_mlock);
2617
2618 d = bpf_dtab[minor(dev)];
2619
2620 if (d == NULL || d == BPF_DEV_RESERVED ||
2621 (d->bd_flags & BPF_CLOSING) != 0 ||
2622 d->bd_bif == NULL) {
2623 lck_mtx_unlock(bpf_mlock);
2624 knote_set_error(kn, ENXIO);
2625 return 0;
2626 }
2627
2628 kn->kn_hook = d;
2629 kn->kn_filtid = EVFILTID_BPFREAD;
2630 KNOTE_ATTACH(&d->bd_sel.si_note, kn);
2631 d->bd_flags |= BPF_KNOTE;
2632
2633 /* capture the current state */
2634 res = filt_bpfread_common(kn, NULL, d);
2635
2636 lck_mtx_unlock(bpf_mlock);
2637
2638 return res;
2639 }
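
/*
 * Illustrative userspace sketch (not part of this file's build): the
 * equivalent kevent() registration served by bpfkqfilter() above; only
 * EVFILT_READ is accepted, and a low-water mark may optionally be set
 * with NOTE_LOWAT.
 *
 *	int kq = kqueue();
 *	struct kevent kev;
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
 *		err(1, "kevent");
 *
 *	struct kevent out;
 *	if (kevent(kq, NULL, 0, &out, 1, NULL) == 1) {
 *		// out.data approximates the number of bytes a read() would return
 *	}
 */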
2640
2641 static void
2642 filt_bpfdetach(struct knote *kn)
2643 {
2644 struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
2645
2646 lck_mtx_lock(bpf_mlock);
2647 if (d->bd_flags & BPF_KNOTE) {
2648 KNOTE_DETACH(&d->bd_sel.si_note, kn);
2649 d->bd_flags &= ~BPF_KNOTE;
2650 }
2651 lck_mtx_unlock(bpf_mlock);
2652 }
2653
2654 static int
2655 filt_bpfread(struct knote *kn, long hint)
2656 {
2657 #pragma unused(hint)
2658 struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
2659
2660 return filt_bpfread_common(kn, NULL, d);
2661 }
2662
2663 static int
2664 filt_bpftouch(struct knote *kn, struct kevent_qos_s *kev)
2665 {
2666 struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
2667 int res;
2668
2669 lck_mtx_lock(bpf_mlock);
2670
2671 /* save off the lowat threshold and flag */
2672 kn->kn_sdata = kev->data;
2673 kn->kn_sfflags = kev->fflags;
2674
2675 /* output data will be re-generated here */
2676 res = filt_bpfread_common(kn, NULL, d);
2677
2678 lck_mtx_unlock(bpf_mlock);
2679
2680 return res;
2681 }
2682
2683 static int
2684 filt_bpfprocess(struct knote *kn, struct kevent_qos_s *kev)
2685 {
2686 struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
2687 int res;
2688
2689 lck_mtx_lock(bpf_mlock);
2690 res = filt_bpfread_common(kn, kev, d);
2691 lck_mtx_unlock(bpf_mlock);
2692
2693 return res;
2694 }
2695
2696 /*
2697 * Copy data from an mbuf chain into a buffer. This code is derived
2698 * from m_copydata in kern/uipc_mbuf.c.
2699 */
2700 static void
2701 bpf_mcopy(struct mbuf * m, void *dst_arg, size_t len)
2702 {
2703 u_int count;
2704 u_char *dst;
2705
2706 dst = dst_arg;
2707 while (len > 0) {
2708 if (m == 0) {
2709 panic("bpf_mcopy");
2710 }
2711 count = min(m->m_len, len);
2712 bcopy(mbuf_data(m), dst, count);
2713 m = m->m_next;
2714 dst += count;
2715 len -= count;
2716 }
2717 }
2718
2719 static inline void
2720 bpf_tap_imp(
2721 ifnet_t ifp,
2722 u_int32_t dlt,
2723 struct bpf_packet *bpf_pkt,
2724 int outbound)
2725 {
2726 struct bpf_d *d;
2727 u_int slen;
2728 struct bpf_if *bp;
2729
2730 /*
2731 * It's possible that we get here after the bpf descriptor has been
2732 * detached from the interface; in such a case we simply return.
2733 * Lock ordering is important since we can be called asynchronously
2734 * (from IOKit) to process an inbound packet; when that happens
2735 * we would have been holding its "gateLock" and will be acquiring
2736 * "bpf_mlock" upon entering this routine. Due to that, we release
2737 * "bpf_mlock" prior to calling ifnet_set_promiscuous (which will
2738 * acquire "gateLock" in IOKit), in order to avoid a deadlock
2739 * when an ifnet_set_promiscuous request simultaneously collides with
2740 * an inbound packet being passed into the tap callback.
2741 */
2742 lck_mtx_lock(bpf_mlock);
2743 if (ifp->if_bpf == NULL) {
2744 lck_mtx_unlock(bpf_mlock);
2745 return;
2746 }
2747 for (bp = ifp->if_bpf; bp != NULL; bp = bp->bif_next) {
2748 if (bp->bif_ifp != ifp) {
2749 /* wrong interface */
2750 bp = NULL;
2751 break;
2752 }
2753 if (dlt == 0 || bp->bif_dlt == dlt) {
2754 /* tapping default DLT or DLT matches */
2755 break;
2756 }
2757 }
2758 if (bp == NULL) {
2759 goto done;
2760 }
2761 for (d = bp->bif_dlist; d; d = d->bd_next) {
2762 struct bpf_packet *bpf_pkt_saved = bpf_pkt;
2763 struct bpf_packet bpf_pkt_tmp;
2764 struct pktap_header_buffer bpfp_header_tmp;
2765
2766 if (outbound && !d->bd_seesent) {
2767 continue;
2768 }
2769
2770 ++d->bd_rcount;
2771 slen = bpf_filter(d->bd_filter, (u_char *)bpf_pkt,
2772 bpf_pkt->bpfp_total_length, 0);
2773 if (bp->bif_ifp->if_type == IFT_PKTAP &&
2774 bp->bif_dlt == DLT_PKTAP) {
2775 /*
2776 * Need to copy the bpf_pkt because the conversion
2777 * to v2 pktap header modifies the content of the
2778 * bpfp_header
2779 */
2780 if ((d->bd_flags & BPF_PKTHDRV2) &&
2781 bpf_pkt->bpfp_header_length <= sizeof(bpfp_header_tmp)) {
2782 bpf_pkt_tmp = *bpf_pkt;
2783
2784 bpf_pkt = &bpf_pkt_tmp;
2785
2786 memcpy(&bpfp_header_tmp, bpf_pkt->bpfp_header,
2787 bpf_pkt->bpfp_header_length);
2788
2789 bpf_pkt->bpfp_header = &bpfp_header_tmp;
2790
2791 convert_to_pktap_header_to_v2(bpf_pkt,
2792 !!(d->bd_flags & BPF_TRUNCATE));
2793 }
2794
2795 if (d->bd_flags & BPF_TRUNCATE) {
2796 slen = min(slen,
2797 get_pkt_trunc_len((u_char *)bpf_pkt,
2798 bpf_pkt->bpfp_total_length));
2799 }
2800 }
2801 if (slen != 0) {
2802 #if CONFIG_MACF_NET
2803 if (mac_bpfdesc_check_receive(d, bp->bif_ifp) != 0) {
2804 continue;
2805 }
2806 #endif
2807 catchpacket(d, bpf_pkt, slen, outbound);
2808 }
2809 bpf_pkt = bpf_pkt_saved;
2810 }
2811
2812 done:
2813 lck_mtx_unlock(bpf_mlock);
2814 }
2815
2816 static inline void
2817 bpf_tap_mbuf(
2818 ifnet_t ifp,
2819 u_int32_t dlt,
2820 mbuf_t m,
2821 void* hdr,
2822 size_t hlen,
2823 int outbound)
2824 {
2825 struct bpf_packet bpf_pkt;
2826 struct mbuf *m0;
2827
2828 if (ifp->if_bpf == NULL) {
2829 /* quickly check without taking lock */
2830 return;
2831 }
2832 bpf_pkt.bpfp_type = BPF_PACKET_TYPE_MBUF;
2833 bpf_pkt.bpfp_mbuf = m;
2834 bpf_pkt.bpfp_total_length = 0;
2835 for (m0 = m; m0 != NULL; m0 = m0->m_next) {
2836 bpf_pkt.bpfp_total_length += m0->m_len;
2837 }
2838 bpf_pkt.bpfp_header = hdr;
2839 if (hdr != NULL) {
2840 bpf_pkt.bpfp_total_length += hlen;
2841 bpf_pkt.bpfp_header_length = hlen;
2842 } else {
2843 bpf_pkt.bpfp_header_length = 0;
2844 }
2845 bpf_tap_imp(ifp, dlt, &bpf_pkt, outbound);
2846 }
2847
2848 void
2849 bpf_tap_out(
2850 ifnet_t ifp,
2851 u_int32_t dlt,
2852 mbuf_t m,
2853 void* hdr,
2854 size_t hlen)
2855 {
2856 bpf_tap_mbuf(ifp, dlt, m, hdr, hlen, 1);
2857 }
2858
2859 void
2860 bpf_tap_in(
2861 ifnet_t ifp,
2862 u_int32_t dlt,
2863 mbuf_t m,
2864 void* hdr,
2865 size_t hlen)
2866 {
2867 bpf_tap_mbuf(ifp, dlt, m, hdr, hlen, 0);
2868 }
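
/*
 * Illustrative driver-side sketch (assumptions, not taken from this file):
 * a driver that registered with bpf_attach() can tap the packets it moves,
 * which funnels into bpf_tap_mbuf() above. Passing 0 for the DLT taps the
 * interface's default attachment.
 *
 *	// input path, once the mbuf chain is fully assembled
 *	bpf_tap_in(ifp, 0, m, NULL, 0);
 *
 *	// output path, just before handing the mbuf to the hardware
 *	bpf_tap_out(ifp, 0, m, NULL, 0);
 */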
2869
2870 /* Callback registered with Ethernet driver. */
2871 static int
2872 bpf_tap_callback(struct ifnet *ifp, struct mbuf *m)
2873 {
2874 bpf_tap_mbuf(ifp, 0, m, NULL, 0, mbuf_pkthdr_rcvif(m) == NULL);
2875
2876 return 0;
2877 }
2878
2879
2880 static errno_t
2881 bpf_copydata(struct bpf_packet *pkt, size_t off, size_t len, void* out_data)
2882 {
2883 errno_t err = 0;
2884 if (pkt->bpfp_type == BPF_PACKET_TYPE_MBUF) {
2885 err = mbuf_copydata(pkt->bpfp_mbuf, off, len, out_data);
2886 } else {
2887 err = EINVAL;
2888 }
2889
2890 return err;
2891 }
2892
2893 static void
2894 copy_bpf_packet(struct bpf_packet * pkt, void * dst, size_t len)
2895 {
2896 /* copy the optional header */
2897 if (pkt->bpfp_header_length != 0) {
2898 size_t count = min(len, pkt->bpfp_header_length);
2899 bcopy(pkt->bpfp_header, dst, count);
2900 len -= count;
2901 dst += count;
2902 }
2903 if (len == 0) {
2904 /* nothing past the header */
2905 return;
2906 }
2907 /* copy the packet */
2908 switch (pkt->bpfp_type) {
2909 case BPF_PACKET_TYPE_MBUF:
2910 bpf_mcopy(pkt->bpfp_mbuf, dst, len);
2911 break;
2912 default:
2913 break;
2914 }
2915 }
2916
2917 static uint16_t
2918 get_esp_trunc_len(__unused struct bpf_packet *pkt, __unused uint16_t off,
2919 const uint16_t remaining_caplen)
2920 {
2921 /*
2922 * For some reason tcpdump expects one byte beyond the ESP header
2923 */
2924 uint16_t trunc_len = ESP_HDR_SIZE + 1;
2925
2926 if (trunc_len > remaining_caplen) {
2927 return remaining_caplen;
2928 }
2929
2930 return trunc_len;
2931 }
2932
2933 static uint16_t
2934 get_isakmp_trunc_len(__unused struct bpf_packet *pkt, __unused uint16_t off,
2935 const uint16_t remaining_caplen)
2936 {
2937 /*
2938 * Include the generic payload header
2939 */
2940 uint16_t trunc_len = ISAKMP_HDR_SIZE;
2941
2942 if (trunc_len > remaining_caplen) {
2943 return remaining_caplen;
2944 }
2945
2946 return trunc_len;
2947 }
2948
2949 static uint16_t
2950 get_isakmp_natt_trunc_len(struct bpf_packet *pkt, uint16_t off,
2951 const uint16_t remaining_caplen)
2952 {
2953 int err = 0;
2954 uint16_t trunc_len = 0;
2955 char payload[remaining_caplen];
2956
2957 err = bpf_copydata(pkt, off, remaining_caplen, payload);
2958 if (err != 0) {
2959 return remaining_caplen;
2960 }
2961 /*
2962 * There are three cases:
2963 * - IKE: the payload starts with a 4-byte header of zeros before the ISAKMP header
2964 * - keep-alive: 1 byte payload
2965 * - otherwise it's ESP
2966 */
2967 if (remaining_caplen >= 4 &&
2968 payload[0] == 0 && payload[1] == 0 &&
2969 payload[2] == 0 && payload[3] == 0) {
2970 trunc_len = 4 + get_isakmp_trunc_len(pkt, off + 4, remaining_caplen - 4);
2971 } else if (remaining_caplen == 1) {
2972 trunc_len = 1;
2973 } else {
2974 trunc_len = get_esp_trunc_len(pkt, off, remaining_caplen);
2975 }
2976
2977 if (trunc_len > remaining_caplen) {
2978 return remaining_caplen;
2979 }
2980
2981 return trunc_len;
2982 }
2983
2984 static uint16_t
2985 get_udp_trunc_len(struct bpf_packet *pkt, uint16_t off, const uint16_t remaining_caplen)
2986 {
2987 int err = 0;
2988 uint16_t trunc_len = sizeof(struct udphdr); /* By default no UDP payload */
2989
2990 if (trunc_len >= remaining_caplen) {
2991 return remaining_caplen;
2992 }
2993
2994 struct udphdr udphdr;
2995 err = bpf_copydata(pkt, off, sizeof(struct udphdr), &udphdr);
2996 if (err != 0) {
2997 return remaining_caplen;
2998 }
2999
3000 u_short sport, dport;
3001
3002 sport = EXTRACT_SHORT(&udphdr.uh_sport);
3003 dport = EXTRACT_SHORT(&udphdr.uh_dport);
3004
3005 if (dport == PORT_DNS || sport == PORT_DNS) {
3006 /*
3007 * Full UDP payload for DNS
3008 */
3009 trunc_len = remaining_caplen;
3010 } else if ((sport == PORT_BOOTPS && dport == PORT_BOOTPC) ||
3011 (sport == PORT_BOOTPC && dport == PORT_BOOTPS)) {
3012 /*
3013 * Full UDP payload for BOOTP and DHCP
3014 */
3015 trunc_len = remaining_caplen;
3016 } else if (dport == PORT_ISAKMP && sport == PORT_ISAKMP) {
3017 /*
3018 * Return the ISAKMP header
3019 */
3020 trunc_len += get_isakmp_trunc_len(pkt, off + sizeof(struct udphdr),
3021 remaining_caplen - sizeof(struct udphdr));
3022 } else if (dport == PORT_ISAKMP_NATT && sport == PORT_ISAKMP_NATT) {
3023 trunc_len += get_isakmp_natt_trunc_len(pkt, off + sizeof(struct udphdr),
3024 remaining_caplen - sizeof(struct udphdr));
3025 }
3026 if (trunc_len >= remaining_caplen) {
3027 return remaining_caplen;
3028 }
3029
3030 return trunc_len;
3031 }
3032
3033 static uint16_t
3034 get_tcp_trunc_len(struct bpf_packet *pkt, uint16_t off, const uint16_t remaining_caplen)
3035 {
3036 int err = 0;
3037 uint16_t trunc_len = sizeof(struct tcphdr); /* By default no TCP payload */
3038 if (trunc_len >= remaining_caplen) {
3039 return remaining_caplen;
3040 }
3041
3042 struct tcphdr tcphdr;
3043 err = bpf_copydata(pkt, off, sizeof(struct tcphdr), &tcphdr);
3044 if (err != 0) {
3045 return remaining_caplen;
3046 }
3047
3048 u_short sport, dport;
3049 sport = EXTRACT_SHORT(&tcphdr.th_sport);
3050 dport = EXTRACT_SHORT(&tcphdr.th_dport);
3051
3052 if (dport == PORT_DNS || sport == PORT_DNS) {
3053 /*
3054 * Full TCP payload for DNS
3055 */
3056 trunc_len = remaining_caplen;
3057 } else {
3058 trunc_len = tcphdr.th_off << 2;
3059 }
3060 if (trunc_len >= remaining_caplen) {
3061 return remaining_caplen;
3062 }
3063
3064 return trunc_len;
3065 }
3066
3067 static uint16_t
3068 get_proto_trunc_len(uint8_t proto, struct bpf_packet *pkt, uint16_t off, const uint16_t remaining_caplen)
3069 {
3070 uint16_t trunc_len;
3071
3072 switch (proto) {
3073 case IPPROTO_ICMP: {
3074 /*
3075 * Full ICMP payload
3076 */
3077 trunc_len = remaining_caplen;
3078 break;
3079 }
3080 case IPPROTO_ICMPV6: {
3081 /*
3082 * Full ICMPv6 payload
3083 */
3084 trunc_len = remaining_caplen;
3085 break;
3086 }
3087 case IPPROTO_IGMP: {
3088 /*
3089 * Full IGMP payload
3090 */
3091 trunc_len = remaining_caplen;
3092 break;
3093 }
3094 case IPPROTO_UDP: {
3095 trunc_len = get_udp_trunc_len(pkt, off, remaining_caplen);
3096 break;
3097 }
3098 case IPPROTO_TCP: {
3099 trunc_len = get_tcp_trunc_len(pkt, off, remaining_caplen);
3100 break;
3101 }
3102 case IPPROTO_ESP: {
3103 trunc_len = get_esp_trunc_len(pkt, off, remaining_caplen);
3104 break;
3105 }
3106 default: {
3107 /*
3108 * By default we only include the IP header
3109 */
3110 trunc_len = 0;
3111 break;
3112 }
3113 }
3114 if (trunc_len >= remaining_caplen) {
3115 return remaining_caplen;
3116 }
3117
3118 return trunc_len;
3119 }
3120
3121 static uint16_t
3122 get_ip_trunc_len(struct bpf_packet *pkt, uint16_t off, const uint16_t remaining_caplen)
3123 {
3124 int err = 0;
3125 uint16_t iplen = sizeof(struct ip);
3126 if (iplen >= remaining_caplen) {
3127 return remaining_caplen;
3128 }
3129
3130 struct ip iphdr;
3131 err = bpf_copydata(pkt, off, sizeof(struct ip), &iphdr);
3132 if (err != 0) {
3133 return remaining_caplen;
3134 }
3135
3136 uint8_t proto = 0;
3137
3138 iplen = iphdr.ip_hl << 2;
3139 if (iplen >= remaining_caplen) {
3140 return remaining_caplen;
3141 }
3142
3143 proto = iphdr.ip_p;
3144 iplen += get_proto_trunc_len(proto, pkt, off + iplen, remaining_caplen - iplen);
3145
3146 if (iplen >= remaining_caplen) {
3147 return remaining_caplen;
3148 }
3149
3150 return iplen;
3151 }
3152
3153 static uint16_t
3154 get_ip6_trunc_len(struct bpf_packet *pkt, uint16_t off, const uint16_t remaining_caplen)
3155 {
3156 int err = 0;
3157 uint16_t iplen = sizeof(struct ip6_hdr);
3158 if (iplen >= remaining_caplen) {
3159 return remaining_caplen;
3160 }
3161
3162 struct ip6_hdr ip6hdr;
3163 err = bpf_copydata(pkt, off, sizeof(struct ip6_hdr), &ip6hdr);
3164 if (err != 0) {
3165 return remaining_caplen;
3166 }
3167
3168 uint8_t proto = 0;
3169
3170 /*
3171 * TBD: process the extension headers
3172 */
3173 proto = ip6hdr.ip6_nxt;
3174 iplen += get_proto_trunc_len(proto, pkt, off + iplen, remaining_caplen - iplen);
3175
3176 if (iplen >= remaining_caplen) {
3177 return remaining_caplen;
3178 }
3179
3180 return iplen;
3181 }
3182
3183 static uint16_t
3184 get_ether_trunc_len(struct bpf_packet *pkt, int off, const uint16_t remaining_caplen)
3185 {
3186 int err = 0;
3187 uint16_t ethlen = sizeof(struct ether_header);
3188 if (ethlen >= remaining_caplen) {
3189 return remaining_caplen;
3190 }
3191
3192 struct ether_header eh;
3193 u_short type;
3194 err = bpf_copydata(pkt, off, sizeof(struct ether_header), &eh);
3195 if (err != 0) {
3196 return remaining_caplen;
3197 }
3198
3199 type = EXTRACT_SHORT(&eh.ether_type);
3200 /* Include full ARP */
3201 if (type == ETHERTYPE_ARP) {
3202 ethlen = remaining_caplen;
3203 } else if (type != ETHERTYPE_IP && type != ETHERTYPE_IPV6) {
3204 ethlen = min(BPF_MIN_PKT_SIZE, remaining_caplen);
3205 } else {
3206 if (type == ETHERTYPE_IP) {
3207 ethlen += get_ip_trunc_len(pkt, sizeof(struct ether_header),
3208 remaining_caplen);
3209 } else if (type == ETHERTYPE_IPV6) {
3210 ethlen += get_ip6_trunc_len(pkt, sizeof(struct ether_header),
3211 remaining_caplen);
3212 }
3213 }
3214 return ethlen;
3215 }
3216
3217 static uint32_t
3218 get_pkt_trunc_len(u_char *p, u_int len)
3219 {
3220 struct bpf_packet *pkt = (struct bpf_packet *)(void *) p;
3221 struct pktap_header *pktap = (struct pktap_header *) (pkt->bpfp_header);
3222 uint32_t out_pkt_len = 0, tlen = 0;
3223 /*
3224 * pktap->pth_frame_pre_length is the L2 header length and accounts
3225 * for both pre and pre_adjust.
3226 * pktap->pth_length is sizeof(pktap_header) (excluding pre/pre_adjust).
3227 * pkt->bpfp_header_length is (pktap->pth_length + pre_adjust).
3228 * pre is the offset of the L3 header past the bpfp_header, i.e. the
3229 * length of the L2 header following the bpfp_header, if present.
3230 */
3231 int32_t pre = pktap->pth_frame_pre_length -
3232 (pkt->bpfp_header_length - pktap->pth_length);
3233
3234 /* Length of the input packet starting from L3 header */
3235 uint32_t in_pkt_len = len - pkt->bpfp_header_length - pre;
3236 if (pktap->pth_protocol_family == AF_INET ||
3237 pktap->pth_protocol_family == AF_INET6) {
3238 /* Contains L2 header */
3239 if (pre > 0) {
3240 if (pre < (int32_t)sizeof(struct ether_header)) {
3241 goto too_short;
3242 }
3243
3244 out_pkt_len = get_ether_trunc_len(pkt, 0, in_pkt_len);
3245 } else if (pre == 0) {
3246 if (pktap->pth_protocol_family == AF_INET) {
3247 out_pkt_len = get_ip_trunc_len(pkt, pre, in_pkt_len);
3248 } else if (pktap->pth_protocol_family == AF_INET6) {
3249 out_pkt_len = get_ip6_trunc_len(pkt, pre, in_pkt_len);
3250 }
3251 } else {
3252 /* Ideally pre should be >= 0. This is an exception */
3253 out_pkt_len = min(BPF_MIN_PKT_SIZE, in_pkt_len);
3254 }
3255 } else {
3256 if (pktap->pth_iftype == IFT_ETHER) {
3257 if (in_pkt_len < sizeof(struct ether_header)) {
3258 goto too_short;
3259 }
3260 /* At most include the Ethernet header and 16 bytes */
3261 out_pkt_len = MIN(sizeof(struct ether_header) + 16,
3262 in_pkt_len);
3263 } else {
3264 /*
3265 * For unknown protocols include at most 16 bytes
3266 */
3267 out_pkt_len = MIN(16, in_pkt_len);
3268 }
3269 }
3270 done:
3271 tlen = pkt->bpfp_header_length + out_pkt_len + pre;
3272 return tlen;
3273 too_short:
3274 out_pkt_len = in_pkt_len;
3275 goto done;
3276 }
3277
3278 /*
3279 * Move the packet data from interface memory (pkt) into the
3280 * store buffer. Wake up pending readers when the store buffer fills,
3281 * or right away in immediate mode or after a read timeout has expired.
3282 */
3283 static void
3284 catchpacket(struct bpf_d *d, struct bpf_packet * pkt,
3285 u_int snaplen, int outbound)
3286 {
3287 struct bpf_hdr *hp;
3288 struct bpf_hdr_ext *ehp;
3289 int totlen, curlen;
3290 int hdrlen, caplen;
3291 int do_wakeup = 0;
3292 u_char *payload;
3293 struct timeval tv;
3294
3295 hdrlen = (d->bd_flags & BPF_EXTENDED_HDR) ? d->bd_bif->bif_exthdrlen :
3296 d->bd_bif->bif_hdrlen;
3297 /*
3298 * Figure out how many bytes to move. If the packet is
3299 * greater or equal to the snapshot length, transfer that
3300 * much. Otherwise, transfer the whole packet (unless
3301 * we hit the buffer size limit).
3302 */
3303 totlen = hdrlen + min(snaplen, pkt->bpfp_total_length);
3304 if (totlen > d->bd_bufsize) {
3305 totlen = d->bd_bufsize;
3306 }
3307
3308 if (hdrlen > totlen) {
3309 return;
3310 }
3311
3312 /*
3313 * Round up the end of the previous packet to the next longword.
3314 */
3315 curlen = BPF_WORDALIGN(d->bd_slen);
3316 if (curlen + totlen > d->bd_bufsize) {
3317 /*
3318 * This packet will overflow the storage buffer.
3319 * Rotate the buffers if we can, then wakeup any
3320 * pending reads.
3321 *
3322 * We cannot rotate buffers if a read is in progress
3323 * so drop the packet
3324 */
3325 if (d->bd_hbuf_read != 0) {
3326 ++d->bd_dcount;
3327 return;
3328 }
3329
3330 if (d->bd_fbuf == NULL) {
3331 if (d->bd_headdrop == 0) {
3332 /*
3333 * We haven't completed the previous read yet,
3334 * so drop the packet.
3335 */
3336 ++d->bd_dcount;
3337 return;
3338 }
3339 /*
3340 * Drop the hold buffer as it contains older packets
3341 */
3342 d->bd_dcount += d->bd_hcnt;
3343 d->bd_fbuf = d->bd_hbuf;
3344 ROTATE_BUFFERS(d);
3345 } else {
3346 ROTATE_BUFFERS(d);
3347 }
3348 do_wakeup = 1;
3349 curlen = 0;
3350 } else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) {
3351 /*
3352 * Immediate mode is set, or the read timeout has
3353 * already expired during a select call. A packet
3354 * arrived, so the reader should be woken up.
3355 */
3356 do_wakeup = 1;
3357 }
3358
3359 /*
3360 * Append the bpf header.
3361 */
3362 microtime(&tv);
3363 if (d->bd_flags & BPF_EXTENDED_HDR) {
3364 struct mbuf *m;
3365
3366 m = (pkt->bpfp_type == BPF_PACKET_TYPE_MBUF)
3367 ? pkt->bpfp_mbuf : NULL;
3368 ehp = (struct bpf_hdr_ext *)(void *)(d->bd_sbuf + curlen);
3369 memset(ehp, 0, sizeof(*ehp));
3370 ehp->bh_tstamp.tv_sec = tv.tv_sec;
3371 ehp->bh_tstamp.tv_usec = tv.tv_usec;
3372
3373 ehp->bh_datalen = pkt->bpfp_total_length;
3374 ehp->bh_hdrlen = hdrlen;
3375 caplen = ehp->bh_caplen = totlen - hdrlen;
3376 if (m == NULL) {
3377 if (outbound) {
3378 ehp->bh_flags |= BPF_HDR_EXT_FLAGS_DIR_OUT;
3379 } else {
3380 ehp->bh_flags |= BPF_HDR_EXT_FLAGS_DIR_IN;
3381 }
3382 } else if (outbound) {
3383 ehp->bh_flags |= BPF_HDR_EXT_FLAGS_DIR_OUT;
3384
3385 /* only do lookups on non-raw INPCB */
3386 if ((m->m_pkthdr.pkt_flags & (PKTF_FLOW_ID |
3387 PKTF_FLOW_LOCALSRC | PKTF_FLOW_RAWSOCK)) ==
3388 (PKTF_FLOW_ID | PKTF_FLOW_LOCALSRC) &&
3389 m->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB) {
3390 ehp->bh_flowid = m->m_pkthdr.pkt_flowid;
3391 ehp->bh_proto = m->m_pkthdr.pkt_proto;
3392 }
3393 ehp->bh_svc = so_svc2tc(m->m_pkthdr.pkt_svc);
3394 if (m->m_pkthdr.pkt_flags & PKTF_TCP_REXMT) {
3395 ehp->bh_pktflags |= BPF_PKTFLAGS_TCP_REXMT;
3396 }
3397 if (m->m_pkthdr.pkt_flags & PKTF_START_SEQ) {
3398 ehp->bh_pktflags |= BPF_PKTFLAGS_START_SEQ;
3399 }
3400 if (m->m_pkthdr.pkt_flags & PKTF_LAST_PKT) {
3401 ehp->bh_pktflags |= BPF_PKTFLAGS_LAST_PKT;
3402 }
3403 if (m->m_pkthdr.pkt_flags & PKTF_VALID_UNSENT_DATA) {
3404 ehp->bh_unsent_bytes =
3405 m->m_pkthdr.bufstatus_if;
3406 ehp->bh_unsent_snd =
3407 m->m_pkthdr.bufstatus_sndbuf;
3408 }
3409 } else {
3410 ehp->bh_flags |= BPF_HDR_EXT_FLAGS_DIR_IN;
3411 }
3412 payload = (u_char *)ehp + hdrlen;
3413 } else {
3414 hp = (struct bpf_hdr *)(void *)(d->bd_sbuf + curlen);
3415 hp->bh_tstamp.tv_sec = tv.tv_sec;
3416 hp->bh_tstamp.tv_usec = tv.tv_usec;
3417 hp->bh_datalen = pkt->bpfp_total_length;
3418 hp->bh_hdrlen = hdrlen;
3419 caplen = hp->bh_caplen = totlen - hdrlen;
3420 payload = (u_char *)hp + hdrlen;
3421 }
3422 /*
3423 * Copy the packet data into the store buffer and update its length.
3424 */
3425 copy_bpf_packet(pkt, payload, caplen);
3426 d->bd_slen = curlen + totlen;
3427 d->bd_scnt += 1;
3428
3429 if (do_wakeup) {
3430 bpf_wakeup(d);
3431 }
3432 }
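
/*
 * Illustrative userspace sketch (not part of this file's build): consuming
 * the records laid out by catchpacket() above. Each record is a bpf_hdr
 * (or bpf_hdr_ext when BIOCSEXTHDR is set) followed by the captured bytes,
 * padded with BPF_WORDALIGN so the next header starts on a longword
 * boundary. Assumes "fd", "buf" and "buflen" (sized via BIOCGBLEN) were set
 * up by the caller; "handle_packet" is a hypothetical callback.
 *
 *	ssize_t n = read(fd, buf, buflen);
 *	char *p = buf;
 *	while (p < buf + n) {
 *		struct bpf_hdr *bh = (struct bpf_hdr *)(void *)p;
 *		handle_packet(p + bh->bh_hdrlen, bh->bh_caplen);
 *		p += BPF_WORDALIGN(bh->bh_hdrlen + bh->bh_caplen);
 *	}
 */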
3433
3434 /*
3435 * Initialize all nonzero fields of a descriptor.
3436 */
3437 static int
3438 bpf_allocbufs(struct bpf_d *d)
3439 {
3440 if (d->bd_sbuf != NULL) {
3441 FREE(d->bd_sbuf, M_DEVBUF);
3442 d->bd_sbuf = NULL;
3443 }
3444 if (d->bd_hbuf != NULL) {
3445 FREE(d->bd_hbuf, M_DEVBUF);
3446 d->bd_hbuf = NULL;
3447 }
3448 if (d->bd_fbuf != NULL) {
3449 FREE(d->bd_fbuf, M_DEVBUF);
3450 d->bd_fbuf = NULL;
3451 }
3452
3453 d->bd_fbuf = (caddr_t) _MALLOC(d->bd_bufsize, M_DEVBUF, M_WAIT);
3454 if (d->bd_fbuf == NULL) {
3455 return ENOBUFS;
3456 }
3457
3458 d->bd_sbuf = (caddr_t) _MALLOC(d->bd_bufsize, M_DEVBUF, M_WAIT);
3459 if (d->bd_sbuf == NULL) {
3460 FREE(d->bd_fbuf, M_DEVBUF);
3461 d->bd_fbuf = NULL;
3462 return ENOBUFS;
3463 }
3464 d->bd_slen = 0;
3465 d->bd_hlen = 0;
3466 d->bd_scnt = 0;
3467 d->bd_hcnt = 0;
3468 return 0;
3469 }
3470
3471 /*
3472 * Free buffers currently in use by a descriptor.
3473 * Called on close.
3474 */
3475 static void
3476 bpf_freed(struct bpf_d *d)
3477 {
3478 /*
3479 * We don't need to lock out interrupts since this descriptor has
3480 * been detached from its interface and it hasn't yet been marked
3481 * free.
3482 */
3483 if (d->bd_hbuf_read != 0) {
3484 panic("bpf buffer freed during read");
3485 }
3486
3487 if (d->bd_sbuf != 0) {
3488 FREE(d->bd_sbuf, M_DEVBUF);
3489 if (d->bd_hbuf != 0) {
3490 FREE(d->bd_hbuf, M_DEVBUF);
3491 }
3492 if (d->bd_fbuf != 0) {
3493 FREE(d->bd_fbuf, M_DEVBUF);
3494 }
3495 }
3496 if (d->bd_filter) {
3497 FREE(d->bd_filter, M_DEVBUF);
3498 }
3499 }
3500
3501 /*
3502 * Attach an interface to bpf. ifp is the interface to tap; dlt is the
3503 * link layer type; hdrlen is the fixed size of the link header
3504 * (variable length headers not yet supported).
3505 */
3506 void
3507 bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen)
3508 {
3509 bpf_attach(ifp, dlt, hdrlen, NULL, NULL);
3510 }
3511
3512 errno_t
3513 bpf_attach(
3514 ifnet_t ifp,
3515 u_int32_t dlt,
3516 u_int32_t hdrlen,
3517 bpf_send_func send,
3518 bpf_tap_func tap)
3519 {
3520 struct bpf_if *bp;
3521 struct bpf_if *bp_new;
3522 struct bpf_if *bp_before_first = NULL;
3523 struct bpf_if *bp_first = NULL;
3524 struct bpf_if *bp_last = NULL;
3525 boolean_t found;
3526
3527 bp_new = (struct bpf_if *) _MALLOC(sizeof(*bp_new), M_DEVBUF,
3528 M_WAIT | M_ZERO);
3529 if (bp_new == 0) {
3530 panic("bpfattach");
3531 }
3532
3533 lck_mtx_lock(bpf_mlock);
3534
3535 /*
3536 * Check if this interface/dlt is already attached. Remember the
3537 * first and last attachment for this interface, as well as the
3538 * element before the first attachment.
3539 */
3540 found = FALSE;
3541 for (bp = bpf_iflist; bp != NULL; bp = bp->bif_next) {
3542 if (bp->bif_ifp != ifp) {
3543 if (bp_first != NULL) {
3544 /* no more elements for this interface */
3545 break;
3546 }
3547 bp_before_first = bp;
3548 } else {
3549 if (bp->bif_dlt == dlt) {
3550 found = TRUE;
3551 break;
3552 }
3553 if (bp_first == NULL) {
3554 bp_first = bp;
3555 }
3556 bp_last = bp;
3557 }
3558 }
3559 if (found) {
3560 lck_mtx_unlock(bpf_mlock);
3561 printf("bpfattach - %s with dlt %d is already attached\n",
3562 if_name(ifp), dlt);
3563 FREE(bp_new, M_DEVBUF);
3564 return EEXIST;
3565 }
3566
3567 bp_new->bif_ifp = ifp;
3568 bp_new->bif_dlt = dlt;
3569 bp_new->bif_send = send;
3570 bp_new->bif_tap = tap;
3571
3572 if (bp_first == NULL) {
3573 /* No other entries for this ifp */
3574 bp_new->bif_next = bpf_iflist;
3575 bpf_iflist = bp_new;
3576 } else {
3577 if (ifnet_type(ifp) == IFT_ETHER && dlt == DLT_EN10MB) {
3578 /* Make this the first entry for this interface */
3579 if (bp_before_first != NULL) {
3580 /* point the previous to us */
3581 bp_before_first->bif_next = bp_new;
3582 } else {
3583 /* we're the new head */
3584 bpf_iflist = bp_new;
3585 }
3586 bp_new->bif_next = bp_first;
3587 } else {
3588 /* Add this after the last entry for this interface */
3589 bp_new->bif_next = bp_last->bif_next;
3590 bp_last->bif_next = bp_new;
3591 }
3592 }
3593
3594 /*
3595 * Compute the length of the bpf header. This is not necessarily
3596 * equal to SIZEOF_BPF_HDR because we want to insert spacing such
3597 * that the network layer header begins on a longword boundary (for
3598 * performance reasons and to alleviate alignment restrictions).
3599 */
3600 bp_new->bif_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen;
3601 bp_new->bif_exthdrlen = BPF_WORDALIGN(hdrlen +
3602 sizeof(struct bpf_hdr_ext)) - hdrlen;
3603
3604 /* Take a reference on the interface */
3605 ifnet_reference(ifp);
3606
3607 lck_mtx_unlock(bpf_mlock);
3608
3609 #ifndef __APPLE__
3610 if (bootverbose) {
3611 printf("bpf: %s attached\n", if_name(ifp));
3612 }
3613 #endif
3614
3615 return 0;
3616 }
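
/*
 * Illustrative driver-side sketch (assumptions, not taken from this file):
 * an Ethernet-style driver typically publishes itself to bpf from its
 * attach routine; bpfdetach() below undoes this when the interface is
 * detached.
 *
 *	// standard Ethernet DLT with a 14-byte link-layer header
 *	bpfattach(ifp, DLT_EN10MB, sizeof(struct ether_header));
 */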
3617
3618 /*
3619 * Detach bpf from an interface. This involves detaching each descriptor
3620 * associated with the interface, and leaving bd_bif NULL. Notify each
3621 * descriptor as it's detached so that any sleepers wake up and get
3622 * ENXIO.
3623 */
3624 void
3625 bpfdetach(struct ifnet *ifp)
3626 {
3627 struct bpf_if *bp, *bp_prev, *bp_next;
3628 struct bpf_d *d;
3629
3630 if (bpf_debug != 0) {
3631 printf("%s: %s\n", __func__, if_name(ifp));
3632 }
3633
3634 lck_mtx_lock(bpf_mlock);
3635
3636 /*
3637 * Detach the descriptors attached to that interface while
3638 * holding the lock, to maintain the integrity of the
3639 * interface list
3640 */
3641 bp_prev = NULL;
3642 for (bp = bpf_iflist; bp != NULL; bp = bp_next) {
3643 bp_next = bp->bif_next;
3644
3645 if (ifp != bp->bif_ifp) {
3646 bp_prev = bp;
3647 continue;
3648 }
3649 /* Unlink from the interface list */
3650 if (bp_prev) {
3651 bp_prev->bif_next = bp->bif_next;
3652 } else {
3653 bpf_iflist = bp->bif_next;
3654 }
3655
3656 /* Detach the devices attached to the interface */
3657 while ((d = bp->bif_dlist) != NULL) {
3658 /*
3659 * Take an extra reference to prevent the device
3660 * from being freed when bpf_detachd() releases
3661 * the reference for the interface list
3662 */
3663 bpf_acquire_d(d);
3664 bpf_detachd(d, 0);
3665 bpf_wakeup(d);
3666 bpf_release_d(d);
3667 }
3668 ifnet_release(ifp);
3669 }
3670
3671 lck_mtx_unlock(bpf_mlock);
3672 }
3673
3674 void
3675 bpf_init(__unused void *unused)
3676 {
3677 #ifdef __APPLE__
3678 int i;
3679 int maj;
3680
3681 if (bpf_devsw_installed == 0) {
3682 bpf_devsw_installed = 1;
3683 bpf_mlock_grp_attr = lck_grp_attr_alloc_init();
3684 bpf_mlock_grp = lck_grp_alloc_init("bpf", bpf_mlock_grp_attr);
3685 bpf_mlock_attr = lck_attr_alloc_init();
3686 lck_mtx_init(bpf_mlock, bpf_mlock_grp, bpf_mlock_attr);
3687 maj = cdevsw_add(CDEV_MAJOR, &bpf_cdevsw);
3688 if (maj == -1) {
3689 if (bpf_mlock_attr) {
3690 lck_attr_free(bpf_mlock_attr);
3691 }
3692 if (bpf_mlock_grp) {
3693 lck_grp_free(bpf_mlock_grp);
3694 }
3695 if (bpf_mlock_grp_attr) {
3696 lck_grp_attr_free(bpf_mlock_grp_attr);
3697 }
3698
3699 bpf_mlock = NULL;
3700 bpf_mlock_attr = NULL;
3701 bpf_mlock_grp = NULL;
3702 bpf_mlock_grp_attr = NULL;
3703 bpf_devsw_installed = 0;
3704 printf("bpf_init: failed to allocate a major number\n");
3705 return;
3706 }
3707
3708 for (i = 0; i < NBPFILTER; i++) {
3709 bpf_make_dev_t(maj);
3710 }
3711 }
3712 #else
3713 cdevsw_add(&bpf_cdevsw);
3714 #endif
3715 }
3716
3717 #ifndef __APPLE__
3718 SYSINIT(bpfdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE + CDEV_MAJOR, bpf_drvinit, NULL);
3719 #endif
3720
3721 #if CONFIG_MACF_NET
3722 struct label *
3723 mac_bpfdesc_label_get(struct bpf_d *d)
3724 {
3725 return d->bd_label;
3726 }
3727
3728 void
3729 mac_bpfdesc_label_set(struct bpf_d *d, struct label *label)
3730 {
3731 d->bd_label = label;
3732 }
3733 #endif
3734
3735 static int
3736 sysctl_bpf_maxbufsize SYSCTL_HANDLER_ARGS
3737 {
3738 #pragma unused(arg1, arg2)
3739 int i, err;
3740
3741 i = bpf_maxbufsize;
3742
3743 err = sysctl_handle_int(oidp, &i, 0, req);
3744 if (err != 0 || req->newptr == USER_ADDR_NULL) {
3745 return err;
3746 }
3747
3748 if (i < 0 || i > BPF_MAXSIZE_CAP) {
3749 i = BPF_MAXSIZE_CAP;
3750 }
3751
3752 bpf_maxbufsize = i;
3753 return err;
3754 }