bsd/net/bpf.c

   1 /*
   2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * Copyright (c) 1990, 1991, 1993
  30  *      The Regents of the University of California.  All rights reserved.
  31  *
  32  * This code is derived from the Stanford/CMU enet packet filter,
  33  * (net/enet.c) distributed as part of 4.3BSD, and code contributed
  34  * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
  35  * Berkeley Laboratory.
  36  *
  37  * Redistribution and use in source and binary forms, with or without
  38  * modification, are permitted provided that the following conditions
  39  * are met:
  40  * 1. Redistributions of source code must retain the above copyright
  41  *    notice, this list of conditions and the following disclaimer.
  42  * 2. Redistributions in binary form must reproduce the above copyright
  43  *    notice, this list of conditions and the following disclaimer in the
  44  *    documentation and/or other materials provided with the distribution.
  45  * 3. All advertising materials mentioning features or use of this software
  46  *    must display the following acknowledgement:
  47  *      This product includes software developed by the University of
  48  *      California, Berkeley and its contributors.
  49  * 4. Neither the name of the University nor the names of its contributors
  50  *    may be used to endorse or promote products derived from this software
  51  *    without specific prior written permission.
  52  *
  53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  63  * SUCH DAMAGE.
  64  *
  65  *      @(#)bpf.c       8.2 (Berkeley) 3/28/94
  66  *
  67  * $FreeBSD: src/sys/net/bpf.c,v 1.59.2.5 2001/01/05 04:49:09 jdp Exp $
  68  */
  69 /*
  70  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  71  * support for mandatory and extensible security protections.  This notice
  72  * is included in support of clause 2.2 (b) of the Apple Public License,
  73  * Version 2.0.
  74  */
  75
  76 #include "bpf.h"
  77
  78 #ifndef __GNUC__
  79 #define inline
  80 #else
  81 #define inline __inline
  82 #endif
  83
  84 #include <sys/param.h>
  85 #include <sys/systm.h>
  86 #include <sys/conf.h>
  87 #include <sys/malloc.h>
  88 #include <sys/mbuf.h>
  89 #include <sys/time.h>
  90 #include <sys/proc.h>
  91 #include <sys/signalvar.h>
  92 #include <sys/filio.h>
  93 #include <sys/sockio.h>
  94 #include <sys/ttycom.h>
  95 #include <sys/filedesc.h>
  96 #include <sys/uio_internal.h>
  97 #include <sys/file_internal.h>
  98 #include <sys/event.h>
  99
 100 #include <sys/poll.h>
 101
 102 #include <sys/socket.h>
 103 #include <sys/socketvar.h>
 104 #include <sys/vnode.h>
 105
 106 #include <net/if.h>
 107 #include <net/bpf.h>
 108 #include <net/bpfdesc.h>
 109
 110 #include <netinet/in.h>
 111 #include <netinet/ip.h>
 112 #include <netinet/ip6.h>
 113 #include <netinet/in_pcb.h>
 114 #include <netinet/in_var.h>
 115 #include <netinet/ip_var.h>
 116 #include <netinet/tcp.h>
 117 #include <netinet/tcp_var.h>
 118 #include <netinet/udp.h>
 119 #include <netinet/udp_var.h>
 120 #include <netinet/if_ether.h>
 121 #include <netinet/isakmp.h>
 122 #include <netinet6/esp.h>
 123 #include <sys/kernel.h>
 124 #include <sys/sysctl.h>
 125 #include <net/firewire.h>
 126
 127 #include <miscfs/devfs/devfs.h>
 128 #include <net/dlil.h>
 129 #include <net/pktap.h>
 130
 131 #include <kern/locks.h>
 132 #include <kern/thread_call.h>
 133 #include <libkern/section_keywords.h>
 134
 135 #include <os/log.h>
 136
 137 extern int tvtohz(struct timeval *);
 138
     /* Compile-time initial value of bpf_bufsize. */
 139 #define BPF_BUFSIZE 4096
     /*
      * Thin wrapper around uiomove(): the `code` argument is accepted but
      * dropped by the expansion, so only (cp, len, uio) reach uiomove().
      */
 140 #define UIOMOVE(cp, len, code, uio) uiomove(cp, len, uio)
 141
 142 #define PRINET  26                      /* interruptible */
 143
     /*
      * Header sizes expressed with sizeof: struct isakmp plus struct isakmp_gen
      * for ISAKMP, and struct newesp for ESP.
      */
 144 #define ISAKMP_HDR_SIZE (sizeof(struct isakmp) + sizeof(struct isakmp_gen))
 145 #define ESP_HDR_SIZE sizeof(struct newesp)
 146
     /*
      * Signature of a packet copy routine.
      * NOTE(review): the arguments look like (source, destination, length) --
      * confirm against the functions assigned to this type.
      */
 147 typedef void (*pktcopyfunc_t)(const void *, void *, size_t);
 148
 149 /*
 150  * The default read buffer size is patchable.
      * bpfopen() copies the current value into each new descriptor's
      * bd_bufsize.
      * NOTE(review): the variable is unsigned int but is exported with
      * SYSCTL_INT -- confirm the signedness mismatch is intended.
 151  */
 152 static unsigned int bpf_bufsize = BPF_BUFSIZE;
 153 SYSCTL_INT(_debug, OID_AUTO, bpf_bufsize, CTLFLAG_RW | CTLFLAG_LOCKED,
 154     &bpf_bufsize, 0, "");
 155
     /*
      * Maximum buffer size, exported through the handler
      * sysctl_bpf_maxbufsize() (defined elsewhere in this file).
      * BPF_MAXSIZE_CAP is half of copysize_limit_panic.
      * NOTE(review): presumably the handler clamps writes to BPF_MAXSIZE_CAP --
      * verify in the handler body.
      */
 156 static int sysctl_bpf_maxbufsize SYSCTL_HANDLER_ARGS;
 157 extern const int copysize_limit_panic;
 158 #define BPF_MAXSIZE_CAP (copysize_limit_panic >> 1)
 159 __private_extern__ unsigned int bpf_maxbufsize = BPF_MAXBUFSIZE;
 160 SYSCTL_PROC(_debug, OID_AUTO, bpf_maxbufsize, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
 161     &bpf_maxbufsize, 0,
 162     sysctl_bpf_maxbufsize, "I", "Default BPF max buffer size");
 163
     /*
      * Upper bound on the number of bpf device nodes: bpf_make_dev_t() creates
      * no further node once nbpfilter has reached this value.
      */
 164 static unsigned int bpf_maxdevices = 256;
 165 SYSCTL_UINT(_debug, OID_AUTO, bpf_maxdevices, CTLFLAG_RW | CTLFLAG_LOCKED,
 166     &bpf_maxdevices, 0, "");
 167 /*
 168  * bpf_wantpktap controls the default visibility of DLT_PKTAP.
 169  * On OS X it is off by default, so a process needs to use the ioctl
 170  * BPF_WANT_PKTAP explicitly to be able to use DLT_PKTAP.
      * bpfopen() samples this value to set or clear BPF_WANT_PKTAP in the new
      * descriptor's bd_flags.
 171  */
 172 #if !XNU_TARGET_OS_OSX
 173 static unsigned int bpf_wantpktap = 1;
 174 #else /* XNU_TARGET_OS_OSX */
 175 static unsigned int bpf_wantpktap = 0;
 176 #endif /* XNU_TARGET_OS_OSX */
 177 SYSCTL_UINT(_debug, OID_AUTO, bpf_wantpktap, CTLFLAG_RW | CTLFLAG_LOCKED,
 178     &bpf_wantpktap, 0, "");
 179
     /* When non-zero, bpf_detachd() and bpfclose() print trace messages. */
 180 static int bpf_debug = 0;
 181 SYSCTL_INT(_debug, OID_AUTO, bpf_debug, CTLFLAG_RW | CTLFLAG_LOCKED,
 182     &bpf_debug, 0, "");
 183
 184 /*
 185  *  bpf_iflist is the list of interfaces; each corresponds to an ifnet
 186  *  bpf_dtab holds pointer to the descriptors, indexed by minor device #
 187  */
 188 static struct bpf_if    *bpf_iflist;
 189 #ifdef __APPLE__
 190 /*
 191  * BSD now stores the bpf_d in the dev_t which is a struct
 192  * on their system. Our dev_t is an int, so we still store
 193  * the bpf_d in a separate table indexed by minor device #.
 194  *
 195  * The value stored in bpf_dtab[n] represent three states:
 196  *  NULL: device not opened
 197  *  BPF_DEV_RESERVED: device opening or closing
 198  *  other: device <n> opened with pointer to storage
 199  */
 200 #define BPF_DEV_RESERVED ((struct bpf_d *)(uintptr_t)1)
 201 static struct bpf_d     **bpf_dtab = NULL;
     /*
      * bpf_dtab_size is the number of slots allocated in bpf_dtab; it grows by
      * NBPFILTER in bpf_make_dev_t().  nbpfilter is the number of device nodes
      * created so far and is the bound bpfopen() checks minor numbers against.
      */
 202 static unsigned int bpf_dtab_size = 0;
 203 static unsigned int     nbpfilter = 0;
 204
     /*
      * bpf_mlock is the mutex taken by bpfopen() and bpfclose() and asserted
      * as owned by bpf_acquire_d() and bpf_release_d().
      * NOTE(review): the group/attribute pointers are presumably set up by
      * bpf_init(), which is not visible here.
      */
 205 decl_lck_mtx_data(static, bpf_mlock_data);
 206 static lck_mtx_t                *bpf_mlock = &bpf_mlock_data;
 207 static lck_grp_t                *bpf_mlock_grp;
 208 static lck_grp_attr_t   *bpf_mlock_grp_attr;
 209 static lck_attr_t               *bpf_mlock_attr;
 210
 211 #endif /* __APPLE__ */
 212
     /* Forward declarations of the file-local helpers. */
 213 static int      bpf_allocbufs(struct bpf_d *);
 214 static errno_t  bpf_attachd(struct bpf_d *d, struct bpf_if *bp);
 215 static int      bpf_detachd(struct bpf_d *d, int);
 216 static void     bpf_freed(struct bpf_d *);
 217 static int      bpf_movein(struct uio *, int,
 218     struct mbuf **, struct sockaddr *, int *);
 219 static int      bpf_setif(struct bpf_d *, ifnet_t ifp, bool, bool);
 220 static void     bpf_timed_out(void *, void *);
 221 static void     bpf_wakeup(struct bpf_d *);
 222 static u_int    get_pkt_trunc_len(u_char *, u_int);
 223 static void     catchpacket(struct bpf_d *, struct bpf_packet *, u_int, int);
 224 static void     reset_d(struct bpf_d *);
 225 static int      bpf_setf(struct bpf_d *, u_int, user_addr_t, u_long);
 226 static int      bpf_getdltlist(struct bpf_d *, caddr_t, struct proc *);
 227 static int      bpf_setdlt(struct bpf_d *, u_int);
 228 static int      bpf_set_traffic_class(struct bpf_d *, int);
 229 static void     bpf_set_packet_service_class(struct mbuf *, int);
 230
     /*
      * Reference counting on a bpf_d.  The definitions further down omit the
      * `static` keyword; these prototypes are what give them internal linkage.
      */
 231 static void     bpf_acquire_d(struct bpf_d *);
 232 static void     bpf_release_d(struct bpf_d *);
 233
     /*
      * NOTE(review): presumably set once the character device switch has been
      * registered -- the code that writes it is not visible here.
      */
 234 static  int bpf_devsw_installed;
 235
 236 void bpf_init(void *unused);
     /* Callback handed to dlil_set_bpf_tap() by bpf_attachd(). */
 237 static int bpf_tap_callback(struct ifnet *ifp, struct mbuf *m);
 238
 239 /*
 240  * Darwin differs from BSD here, the following are static
 241  * on BSD and not static on Darwin.
 242  */
 243 d_open_t            bpfopen;
 244 d_close_t           bpfclose;
 245 d_read_t            bpfread;
 246 d_write_t           bpfwrite;
 247 ioctl_fcn_t         bpfioctl;
 248 select_fcn_t        bpfselect;
 249
 250 /* Darwin's cdevsw struct differs slightly from BSDs */
     /*
      * NOTE(review): CDEV_MAJOR is presumably the major number requested when
      * bpf_cdevsw is registered -- the registration is not visible here.
      */
 251 #define CDEV_MAJOR 23
     /*
      * Character device switch for the bpf devices.  open, close, read, write,
      * ioctl and select are routed to the entry points declared above; every
      * other slot is filled with an eno_* routine, and d_ttys is NULL.
      */
 252 static const struct cdevsw bpf_cdevsw = {
 253         .d_open       = bpfopen,
 254         .d_close      = bpfclose,
 255         .d_read       = bpfread,
 256         .d_write      = bpfwrite,
 257         .d_ioctl      = bpfioctl,
 258         .d_stop       = eno_stop,
 259         .d_reset      = eno_reset,
 260         .d_ttys       = NULL,
 261         .d_select     = bpfselect,
 262         .d_mmap       = eno_mmap,
 263         .d_strategy   = eno_strat,
 264         .d_reserved_1 = eno_getc,
 265         .d_reserved_2 = eno_putc,
 266         .d_type       = 0
 267 };
 268
 269 #define SOCKADDR_HDR_LEN           offsetof(struct sockaddr, sa_data)
 270
 271 static int
 272 bpf_movein(struct uio *uio, int linktype, struct mbuf **mp,
 273     struct sockaddr *sockp, int *datlen)
 274 {
 275         struct mbuf *m;
 276         int error;
 277         int len;
 278         uint8_t sa_family;
 279         int hlen;
 280
 281         switch (linktype) {
 282 #if SLIP
 283         case DLT_SLIP:
 284                 sa_family = AF_INET;
 285                 hlen = 0;
 286                 break;
 287 #endif /* SLIP */
 288
 289         case DLT_EN10MB:
 290                 sa_family = AF_UNSPEC;
 291                 /* XXX Would MAXLINKHDR be better? */
 292                 hlen = sizeof(struct ether_header);
 293                 break;
 294
 295 #if FDDI
 296         case DLT_FDDI:
 297 #if defined(__FreeBSD__) || defined(__bsdi__)
 298                 sa_family = AF_IMPLINK;
 299                 hlen = 0;
 300 #else
 301                 sa_family = AF_UNSPEC;
 302                 /* XXX 4(FORMAC)+6(dst)+6(src)+3(LLC)+5(SNAP) */
 303                 hlen = 24;
 304 #endif
 305                 break;
 306 #endif /* FDDI */
 307
 308         case DLT_RAW:
 309         case DLT_NULL:
 310                 sa_family = AF_UNSPEC;
 311                 hlen = 0;
 312                 break;
 313
 314 #ifdef __FreeBSD__
 315         case DLT_ATM_RFC1483:
 316                 /*
 317                  * en atm driver requires 4-byte atm pseudo header.
 318                  * though it isn't standard, vpi:vci needs to be
 319                  * specified anyway.
 320                  */
 321                 sa_family = AF_UNSPEC;
 322                 hlen = 12;      /* XXX 4(ATM_PH) + 3(LLC) + 5(SNAP) */
 323                 break;
 324 #endif
 325
 326         case DLT_PPP:
 327                 sa_family = AF_UNSPEC;
 328                 hlen = 4;       /* This should match PPP_HDRLEN */
 329                 break;
 330
 331         case DLT_APPLE_IP_OVER_IEEE1394:
 332                 sa_family = AF_UNSPEC;
 333                 hlen = sizeof(struct firewire_header);
 334                 break;
 335
 336         case DLT_IEEE802_11:            /* IEEE 802.11 wireless */
 337                 sa_family = AF_IEEE80211;
 338                 hlen = 0;
 339                 break;
 340
 341         case DLT_IEEE802_11_RADIO:
 342                 sa_family = AF_IEEE80211;
 343                 hlen = 0;
 344                 break;
 345
 346         default:
 347                 return EIO;
 348         }
 349
 350         // LP64todo - fix this!
 351         len = uio_resid(uio);
 352         *datlen = len - hlen;
 353         if ((unsigned)len > MCLBYTES) {
 354                 return EIO;
 355         }
 356
 357         if (sockp) {
 358                 /*
 359                  * Build a sockaddr based on the data link layer type.
 360                  * We do this at this level because the ethernet header
 361                  * is copied directly into the data field of the sockaddr.
 362                  * In the case of SLIP, there is no header and the packet
 363                  * is forwarded as is.
 364                  * Also, we are careful to leave room at the front of the mbuf
 365                  * for the link level header.
 366                  */
 367                 if ((hlen + SOCKADDR_HDR_LEN) > sockp->sa_len) {
 368                         return EIO;
 369                 }
 370                 sockp->sa_family = sa_family;
 371         } else {
 372                 /*
 373                  * We're directly sending the packet data supplied by
 374                  * the user; we don't need to make room for the link
 375                  * header, and don't need the header length value any
 376                  * more, so set it to 0.
 377                  */
 378                 hlen = 0;
 379         }
 380
 381         MGETHDR(m, M_WAIT, MT_DATA);
 382         if (m == 0) {
 383                 return ENOBUFS;
 384         }
 385         if ((unsigned)len > MHLEN) {
 386                 MCLGET(m, M_WAIT);
 387                 if ((m->m_flags & M_EXT) == 0) {
 388                         error = ENOBUFS;
 389                         goto bad;
 390                 }
 391         }
 392         m->m_pkthdr.len = m->m_len = len;
 393         m->m_pkthdr.rcvif = NULL;
 394         *mp = m;
 395
 396         /*
 397          * Make room for link header.
 398          */
 399         if (hlen != 0) {
 400                 m->m_pkthdr.len -= hlen;
 401                 m->m_len -= hlen;
 402                 m->m_data += hlen; /* XXX */
 403                 error = UIOMOVE((caddr_t)sockp->sa_data, hlen, UIO_WRITE, uio);
 404                 if (error) {
 405                         goto bad;
 406                 }
 407         }
 408         error = UIOMOVE(mtod(m, caddr_t), len - hlen, UIO_WRITE, uio);
 409         if (error) {
 410                 goto bad;
 411         }
 412
 413         /* Check for multicast destination */
 414         switch (linktype) {
 415         case DLT_EN10MB: {
 416                 struct ether_header *eh;
 417
 418                 eh = mtod(m, struct ether_header *);
 419                 if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
 420                         if (_ether_cmp(etherbroadcastaddr,
 421                             eh->ether_dhost) == 0) {
 422                                 m->m_flags |= M_BCAST;
 423                         } else {
 424                                 m->m_flags |= M_MCAST;
 425                         }
 426                 }
 427                 break;
 428         }
 429         }
 430
 431         return 0;
 432 bad:
 433         m_freem(m);
 434         return error;
 435 }
 436
 437 #ifdef __APPLE__
 438
 439 /*
 440  * The dynamic addition of a new device node must block all processes that
 441  * are opening the last device so that no process will get an unexpected
 442  * ENOENT
 443  */
 444 static void
 445 bpf_make_dev_t(int maj)
 446 {
 447         static int              bpf_growing = 0;
 448         unsigned int    cur_size = nbpfilter, i;
 449
 450         if (nbpfilter >= bpf_maxdevices) {
 451                 return;
 452         }
 453
 454         while (bpf_growing) {
 455                 /* Wait until new device has been created */
 456                 (void) tsleep((caddr_t)&bpf_growing, PZERO, "bpf_growing", 0);
 457         }
 458         if (nbpfilter > cur_size) {
 459                 /* other thread grew it already */
 460                 return;
 461         }
 462         bpf_growing = 1;
 463
 464         /* need to grow bpf_dtab first */
 465         if (nbpfilter == bpf_dtab_size) {
 466                 int new_dtab_size;
 467                 struct bpf_d **new_dtab = NULL;
 468                 struct bpf_d **old_dtab = NULL;
 469
 470                 new_dtab_size = bpf_dtab_size + NBPFILTER;
 471                 new_dtab = (struct bpf_d **)_MALLOC(
 472                         sizeof(struct bpf_d *) * new_dtab_size, M_DEVBUF, M_WAIT);
 473                 if (new_dtab == 0) {
 474                         printf("bpf_make_dev_t: malloc bpf_dtab failed\n");
 475                         goto done;
 476                 }
 477                 if (bpf_dtab) {
 478                         bcopy(bpf_dtab, new_dtab,
 479                             sizeof(struct bpf_d *) * bpf_dtab_size);
 480                 }
 481                 bzero(new_dtab + bpf_dtab_size,
 482                     sizeof(struct bpf_d *) * NBPFILTER);
 483                 old_dtab = bpf_dtab;
 484                 bpf_dtab = new_dtab;
 485                 bpf_dtab_size = new_dtab_size;
 486                 if (old_dtab != NULL) {
 487                         _FREE(old_dtab, M_DEVBUF);
 488                 }
 489         }
 490         i = nbpfilter++;
 491         (void) devfs_make_node(makedev(maj, i),
 492             DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0600,
 493             "bpf%d", i);
 494 done:
 495         bpf_growing = 0;
 496         wakeup((caddr_t)&bpf_growing);
 497 }
 498
 499 #endif
 500
 501 /*
 502  * Attach file to the bpf interface, i.e. make d listen on bp.
 503  */
 504 static errno_t
 505 bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
 506 {
 507         int first = bp->bif_dlist == NULL;
 508         int     error = 0;
 509
 510         /*
 511          * Point d at bp, and add d to the interface's list of listeners.
 512          * Finally, point the driver's bpf cookie at the interface so
 513          * it will divert packets to bpf.
 514          */
 515         d->bd_bif = bp;
 516         d->bd_next = bp->bif_dlist;
 517         bp->bif_dlist = d;
 518
 519         /*
 520          * Take a reference on the device even if an error is returned
 521          * because we keep the device in the interface's list of listeners
 522          */
 523         bpf_acquire_d(d);
 524
 525         if (first) {
 526                 /* Find the default bpf entry for this ifp */
 527                 if (bp->bif_ifp->if_bpf == NULL) {
 528                         struct bpf_if   *tmp, *primary = NULL;
 529
 530                         for (tmp = bpf_iflist; tmp; tmp = tmp->bif_next) {
 531                                 if (tmp->bif_ifp == bp->bif_ifp) {
 532                                         primary = tmp;
 533                                         break;
 534                                 }
 535                         }
 536                         bp->bif_ifp->if_bpf = primary;
 537                 }
 538                 /* Only call dlil_set_bpf_tap for primary dlt */
 539                 if (bp->bif_ifp->if_bpf == bp) {
 540                         dlil_set_bpf_tap(bp->bif_ifp, BPF_TAP_INPUT_OUTPUT,
 541                             bpf_tap_callback);
 542                 }
 543
 544                 if (bp->bif_tap != NULL) {
 545                         error = bp->bif_tap(bp->bif_ifp, bp->bif_dlt,
 546                             BPF_TAP_INPUT_OUTPUT);
 547                 }
 548         }
 549
 550         /*
 551          * Reset the detach flags in case we previously detached an interface
 552          */
 553         d->bd_flags &= ~(BPF_DETACHING | BPF_DETACHED);
 554
 555         if (bp->bif_dlt == DLT_PKTAP) {
 556                 d->bd_flags |= BPF_FINALIZE_PKTAP;
 557         } else {
 558                 d->bd_flags &= ~BPF_FINALIZE_PKTAP;
 559         }
 560         return error;
 561 }
 562
 563 /*
 564  * Detach a file from its interface.
 565  *
      * d        descriptor to remove from its interface's listener list
      * closing  only used to decorate the debug message
      *
      * Does nothing but report the closing state when BPF_DETACHED or
      * BPF_DETACHING is already set.  Otherwise d is unlinked, the taps are
      * disabled once the list becomes empty, promiscuous mode requested by d
      * is turned off, and one reference on d is released.
      *
      * bpf_mlock is dropped and re-taken around ifnet_set_promiscuous().
      *
 566  * Return 1 if was closed by some thread, 0 otherwise
      * (that is, whether BPF_CLOSING is set in d->bd_flags).
 567  */
 568 static int
 569 bpf_detachd(struct bpf_d *d, int closing)
 570 {
 571         struct bpf_d **p;
 572         struct bpf_if *bp;
 573         struct ifnet  *ifp;
 574
 575         int bpf_closed = d->bd_flags & BPF_CLOSING;
 576         /*
 577          * Some other thread already detached
 578          */
 579         if ((d->bd_flags & (BPF_DETACHED | BPF_DETACHING)) != 0) {
 580                 goto done;
 581         }
 582         /*
 583          * This thread is doing the detach
 584          */
 585         d->bd_flags |= BPF_DETACHING;
 586
 587         ifp = d->bd_bif->bif_ifp;
 588         bp = d->bd_bif;
 589
 590         if (bpf_debug != 0) {
 591                 printf("%s: %llx %s%s\n",
 592                     __func__, (uint64_t)VM_KERNEL_ADDRPERM(d),
 593                     if_name(ifp), closing ? " closing" : "");
 594         }
 595
 596         /* Remove d from the interface's descriptor list. */
             /*
              * NOTE(review): the NULL test runs after p has been advanced
              * through (*p)->bd_next, so an empty list would be dereferenced
              * before the panic is reached -- confirm d is always on the list.
              */
 597         p = &bp->bif_dlist;
 598         while (*p != d) {
 599                 p = &(*p)->bd_next;
 600                 if (*p == 0) {
 601                         panic("bpf_detachd: descriptor not in list");
 602                 }
 603         }
 604         *p = (*p)->bd_next;
 605         if (bp->bif_dlist == 0) {
 606                 /*
 607                  * Let the driver know that there are no more listeners.
 608                  */
 609                 /* Only call dlil_set_bpf_tap for primary dlt */
 610                 if (bp->bif_ifp->if_bpf == bp) {
 611                         dlil_set_bpf_tap(ifp, BPF_TAP_DISABLE, NULL);
 612                 }
 613                 if (bp->bif_tap) {
 614                         bp->bif_tap(ifp, bp->bif_dlt, BPF_TAP_DISABLE);
 615                 }
 616
                     /*
                      * Look for another bpf_if of the same ifnet that still has
                      * listeners; bp is reused as the cursor.  When there is
                      * none, the ifnet's default bpf entry is cleared.
                      */
 617                 for (bp = bpf_iflist; bp; bp = bp->bif_next) {
 618                         if (bp->bif_ifp == ifp && bp->bif_dlist != 0) {
 619                                 break;
 620                         }
 621                 }
 622                 if (bp == NULL) {
 623                         ifp->if_bpf = NULL;
 624                 }
 625         }
 626         d->bd_bif = NULL;
 627         /*
 628          * Check if this descriptor had requested promiscuous mode.
 629          * If so, turn it off.
 630          */
 631         if (d->bd_promisc) {
 632                 d->bd_promisc = 0;
                     /* bpf_mlock is not held across ifnet_set_promiscuous(). */
 633                 lck_mtx_unlock(bpf_mlock);
 634                 if (ifnet_set_promiscuous(ifp, 0)) {
 635                         /*
 636                          * Something is really wrong if we were able to put
 637                          * the driver into promiscuous mode, but can't
 638                          * take it out.
 639                          * Most likely the network interface is gone.
 640                          */
 641                         printf("%s: ifnet_set_promiscuous failed\n", __func__);
 642                 }
 643                 lck_mtx_lock(bpf_mlock);
 644         }
 645
 646         /*
 647          * Wake up other threads that are waiting for this thread to finish
 648          * detaching
 649          */
 650         d->bd_flags &= ~BPF_DETACHING;
 651         d->bd_flags |= BPF_DETACHED;
 652
 653         /* Refresh the local variable as d could have been modified */
 654         bpf_closed = d->bd_flags & BPF_CLOSING;
 655         /*
 656          * Note that we've kept the reference because we may have dropped
 657          * the lock when turning off promiscuous mode
              *
              * bpf_release_d() frees d when this was the last reference, so d
              * is not dereferenced past this point; only its address is
              * printed below.
 658          */
 659         bpf_release_d(d);
 660
 661 done:
 662         /*
 663          * When closing makes sure no other thread refer to the bpf_d
 664          */
 665         if (bpf_debug != 0) {
 666                 printf("%s: %llx done\n",
 667                     __func__, (uint64_t)VM_KERNEL_ADDRPERM(d));
 668         }
 669         /*
 670          * Let the caller know the bpf_d is closed
 671          */
 672         if (bpf_closed) {
 673                 return 1;
 674         } else {
 675                 return 0;
 676         }
 677 }
 678
 679 /*
 680  * Start asynchronous timer, if necessary.
 681  * Must be called with bpf_mlock held.
 682  */
 683 static void
 684 bpf_start_timer(struct bpf_d *d)
 685 {
 686         uint64_t deadline;
 687         struct timeval tv;
 688
 689         if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
 690                 tv.tv_sec = d->bd_rtout / hz;
 691                 tv.tv_usec = (d->bd_rtout % hz) * tick;
 692
 693                 clock_interval_to_deadline(
 694                         (uint64_t)tv.tv_sec * USEC_PER_SEC + tv.tv_usec,
 695                         NSEC_PER_USEC, &deadline);
 696                 /*
 697                  * The state is BPF_IDLE, so the timer hasn't
 698                  * been started yet, and hasn't gone off yet;
 699                  * there is no thread call scheduled, so this
 700                  * won't change the schedule.
 701                  *
 702                  * XXX - what if, by the time it gets entered,
 703                  * the deadline has already passed?
 704                  */
 705                 thread_call_enter_delayed(d->bd_thread_call, deadline);
 706                 d->bd_state = BPF_WAITING;
 707         }
 708 }
 709
 710 /*
 711  * Cancel asynchronous timer.
 712  * Must be called with bpf_mlock held.
 713  */
 714 static boolean_t
 715 bpf_stop_timer(struct bpf_d *d)
 716 {
 717         /*
 718          * If the timer has already gone off, this does nothing.
 719          * Our caller is expected to set d->bd_state to BPF_IDLE,
 720          * with the bpf_mlock, after we are called. bpf_timed_out()
 721          * also grabs bpf_mlock, so, if the timer has gone off and
 722          * bpf_timed_out() hasn't finished, it's waiting for the
 723          * lock; when this thread releases the lock, it will
 724          * find the state is BPF_IDLE, and just release the
 725          * lock and return.
 726          */
 727         return thread_call_cancel(d->bd_thread_call);
 728 }
 729
 730 void
 731 bpf_acquire_d(struct bpf_d *d)
 732 {
 733         void *lr_saved =  __builtin_return_address(0);
 734
 735         LCK_MTX_ASSERT(bpf_mlock, LCK_MTX_ASSERT_OWNED);
 736
 737         d->bd_refcnt += 1;
 738
 739         d->bd_ref_lr[d->bd_next_ref_lr] = lr_saved;
 740         d->bd_next_ref_lr = (d->bd_next_ref_lr + 1) % BPF_REF_HIST;
 741 }
 742
 743 void
 744 bpf_release_d(struct bpf_d *d)
 745 {
 746         void *lr_saved =  __builtin_return_address(0);
 747
 748         LCK_MTX_ASSERT(bpf_mlock, LCK_MTX_ASSERT_OWNED);
 749
 750         if (d->bd_refcnt <= 0) {
 751                 panic("%s: %p refcnt <= 0", __func__, d);
 752         }
 753
 754         d->bd_refcnt -= 1;
 755
 756         d->bd_unref_lr[d->bd_next_unref_lr] = lr_saved;
 757         d->bd_next_unref_lr = (d->bd_next_unref_lr + 1) % BPF_REF_HIST;
 758
 759         if (d->bd_refcnt == 0) {
 760                 /* Assert the device is detached */
 761                 if ((d->bd_flags & BPF_DETACHED) == 0) {
 762                         panic("%s: %p BPF_DETACHED not set", __func__, d);
 763                 }
 764
 765                 _FREE(d, M_DEVBUF);
 766         }
 767 }
 768
 769 /*
 770  * Open ethernet device.  Returns ENXIO for illegal minor device number,
 771  * EBUSY if file is open by another process.
 772  */
 773 /* ARGSUSED */
 774 int
 775 bpfopen(dev_t dev, int flags, __unused int fmt,
 776     struct proc *p)
 777 {
 778         struct bpf_d *d;
 779
 780         lck_mtx_lock(bpf_mlock);
 781         if ((unsigned int) minor(dev) >= nbpfilter) {
 782                 lck_mtx_unlock(bpf_mlock);
 783                 return ENXIO;
 784         }
 785         /*
 786          * New device nodes are created on demand when opening the last one.
 787          * The programming model is for processes to loop on the minor starting
 788          * at 0 as long as EBUSY is returned. The loop stops when either the
 789          * open succeeds or an error other that EBUSY is returned. That means
 790          * that bpf_make_dev_t() must block all processes that are opening the
 791          * last  node. If not all processes are blocked, they could unexpectedly
 792          * get ENOENT and abort their opening loop.
 793          */
 794         if ((unsigned int) minor(dev) == (nbpfilter - 1)) {
 795                 bpf_make_dev_t(major(dev));
 796         }
 797
 798         /*
 799          * Each minor can be opened by only one process.  If the requested
 800          * minor is in use, return EBUSY.
 801          *
 802          * Important: bpfopen() and bpfclose() have to check and set the status
 803          * of a device in the same lockin context otherwise the device may be
 804          * leaked because the vnode use count will be unpextectly greater than 1
 805          * when close() is called.
 806          */
 807         if (bpf_dtab[minor(dev)] == NULL) {
 808                 /* Reserve while opening */
 809                 bpf_dtab[minor(dev)] = BPF_DEV_RESERVED;
 810         } else {
 811                 lck_mtx_unlock(bpf_mlock);
 812                 return EBUSY;
 813         }
 814         d = (struct bpf_d *)_MALLOC(sizeof(struct bpf_d), M_DEVBUF,
 815             M_WAIT | M_ZERO);
 816         if (d == NULL) {
 817                 /* this really is a catastrophic failure */
 818                 printf("bpfopen: malloc bpf_d failed\n");
 819                 bpf_dtab[minor(dev)] = NULL;
 820                 lck_mtx_unlock(bpf_mlock);
 821                 return ENOMEM;
 822         }
 823
 824         /* Mark "in use" and do most initialization. */
 825         bpf_acquire_d(d);
 826         d->bd_bufsize = bpf_bufsize;
 827         d->bd_sig = SIGIO;
 828         d->bd_seesent = 1;
 829         d->bd_oflags = flags;
 830         d->bd_state = BPF_IDLE;
 831         d->bd_traffic_class = SO_TC_BE;
 832         d->bd_flags |= BPF_DETACHED;
 833         if (bpf_wantpktap) {
 834                 d->bd_flags |= BPF_WANT_PKTAP;
 835         } else {
 836                 d->bd_flags &= ~BPF_WANT_PKTAP;
 837         }
 838         d->bd_thread_call = thread_call_allocate(bpf_timed_out, d);
 839         if (d->bd_thread_call == NULL) {
 840                 printf("bpfopen: malloc thread call failed\n");
 841                 bpf_dtab[minor(dev)] = NULL;
 842                 bpf_release_d(d);
 843                 lck_mtx_unlock(bpf_mlock);
 844
 845                 return ENOMEM;
 846         }
 847         d->bd_opened_by = p;
 848         uuid_generate(d->bd_uuid);
 849
 850         bpf_dtab[minor(dev)] = d; /* Mark opened */
 851         lck_mtx_unlock(bpf_mlock);
 852
 853         return 0;
 854 }
 855
 856 /*
 857  * Close the descriptor by detaching it from its interface,
 858  * deallocating its buffers, and marking it free.
 859  */
 860 /* ARGSUSED */
 861 int
 862 bpfclose(dev_t dev, __unused int flags, __unused int fmt,
 863     __unused struct proc *p)
 864 {
 865         struct bpf_d *d;
 866
 867         /* Take BPF lock to ensure no other thread is using the device */
 868         lck_mtx_lock(bpf_mlock);
 869
 870         d = bpf_dtab[minor(dev)];
 871         if (d == NULL || d == BPF_DEV_RESERVED) {
 872                 lck_mtx_unlock(bpf_mlock);
 873                 return ENXIO;
 874         }
 875
 876         /*
 877          * Other threads may call bpd_detachd() if we drop the bpf_mlock
 878          */
 879         d->bd_flags |= BPF_CLOSING;
 880
 881         if (bpf_debug != 0) {
 882                 printf("%s: %llx\n",
 883                     __func__, (uint64_t)VM_KERNEL_ADDRPERM(d));
 884         }
 885
 886         bpf_dtab[minor(dev)] = BPF_DEV_RESERVED; /* Reserve while closing */
 887
 888         /*
 889          * Deal with any in-progress timeouts.
 890          */
 891         switch (d->bd_state) {
 892         case BPF_IDLE:
 893                 /*
 894                  * Not waiting for a timeout, and no timeout happened.
 895                  */
 896                 break;
 897
 898         case BPF_WAITING:
 899                 /*
 900                  * Waiting for a timeout.
 901                  * Cancel any timer that has yet to go off,
 902                  * and mark the state as "closing".
 903                  * Then drop the lock to allow any timers that
 904                  * *have* gone off to run to completion, and wait
 905                  * for them to finish.
 906                  */
 907                 if (!bpf_stop_timer(d)) {
 908                         /*
 909                          * There was no pending call, so the call must
 910                          * have been in progress. Wait for the call to
 911                          * complete; we have to drop the lock while
 912                          * waiting. to let the in-progrss call complete
 913                          */
 914                         d->bd_state = BPF_DRAINING;
 915                         while (d->bd_state == BPF_DRAINING) {
 916                                 msleep((caddr_t)d, bpf_mlock, PRINET,
 917                                     "bpfdraining", NULL);
 918                         }
 919                 }
 920                 d->bd_state = BPF_IDLE;
 921                 break;
 922
 923         case BPF_TIMED_OUT:
 924                 /*
 925                  * Timer went off, and the timeout routine finished.
 926                  */
 927                 d->bd_state = BPF_IDLE;
 928                 break;
 929
 930         case BPF_DRAINING:
 931                 /*
 932                  * Another thread is blocked on a close waiting for
 933                  * a timeout to finish.
 934                  * This "shouldn't happen", as the first thread to enter
 935                  * bpfclose() will set bpf_dtab[minor(dev)] to 1, and
 936                  * all subsequent threads should see that and fail with
 937                  * ENXIO.
 938                  */
 939                 panic("Two threads blocked in a BPF close");
 940                 break;
 941         }
 942
 943         if (d->bd_bif) {
 944                 bpf_detachd(d, 1);
 945         }
 946         selthreadclear(&d->bd_sel);
 947         thread_call_free(d->bd_thread_call);
 948
 949         while (d->bd_hbuf_read != 0) {
 950                 msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading", NULL);
 951         }
 952
 953         bpf_freed(d);
 954
 955         /* Mark free in same context as bpfopen comes to check */
 956         bpf_dtab[minor(dev)] = NULL;                    /* Mark closed */
 957
 958         bpf_release_d(d);
 959
 960         lck_mtx_unlock(bpf_mlock);
 961
 962         return 0;
 963 }
 964
 965 #define BPF_SLEEP bpf_sleep
 966
 967 static int
 968 bpf_sleep(struct bpf_d *d, int pri, const char *wmesg, int timo)
 969 {
 970         u_int64_t abstime = 0;
 971
 972         if (timo != 0) {
 973                 clock_interval_to_deadline(timo, NSEC_PER_SEC / hz, &abstime);
 974         }
 975
 976         return msleep1((caddr_t)d, bpf_mlock, pri, wmesg, abstime);
 977 }
 978
 979 static void
 980 bpf_finalize_pktap(struct bpf_hdr *hp, struct pktap_header *pktaphdr)
 981 {
 982         if (pktaphdr->pth_flags & PTH_FLAG_V2_HDR) {
 983                 struct pktap_v2_hdr *pktap_v2_hdr;
 984
 985                 pktap_v2_hdr = (struct pktap_v2_hdr *)pktaphdr;
 986
 987                 if (pktap_v2_hdr->pth_flags & PTH_FLAG_DELAY_PKTAP) {
 988                         pktap_v2_finalize_proc_info(pktap_v2_hdr);
 989                 }
 990         } else {
 991                 if (pktaphdr->pth_flags & PTH_FLAG_DELAY_PKTAP) {
 992                         pktap_finalize_proc_info(pktaphdr);
 993                 }
 994
 995                 if (pktaphdr->pth_flags & PTH_FLAG_TSTAMP) {
 996                         hp->bh_tstamp.tv_sec = pktaphdr->pth_tstamp.tv_sec;
 997                         hp->bh_tstamp.tv_usec = pktaphdr->pth_tstamp.tv_usec;
 998                 }
 999         }
1000 }
1001
/*
 * Rotate the packet buffers in descriptor d.  Move the store buffer
 * into the hold slot, and the free buffer into the store slot.
 * Zero the length of the new store buffer.
 *
 * Panics if the hold buffer is currently being read (bd_hbuf_read != 0).
 *
 * The body is wrapped in do { } while (0) so the macro expands to a single
 * statement and is safe under an unbraced if/else; invoke it with a
 * trailing semicolon.  The argument is parenthesized at every use.
 */
#define ROTATE_BUFFERS(d) do { \
        if ((d)->bd_hbuf_read != 0) { \
                panic("rotating bpf buffers during read"); \
        } \
        (d)->bd_hbuf = (d)->bd_sbuf; \
        (d)->bd_hlen = (d)->bd_slen; \
        (d)->bd_hcnt = (d)->bd_scnt; \
        (d)->bd_sbuf = (d)->bd_fbuf; \
        (d)->bd_slen = 0; \
        (d)->bd_scnt = 0; \
        (d)->bd_fbuf = NULL; \
} while (0)
1017 /*
1018  *  bpfread - read next chunk of packets from buffers
1019  */
1020 int
1021 bpfread(dev_t dev, struct uio *uio, int ioflag)
1022 {
1023         struct bpf_d *d;
1024         caddr_t hbuf;
1025         int timed_out, hbuf_len;
1026         int error;
1027         int flags;
1028
1029         lck_mtx_lock(bpf_mlock);
1030
1031         d = bpf_dtab[minor(dev)];
1032         if (d == NULL || d == BPF_DEV_RESERVED ||
1033             (d->bd_flags & BPF_CLOSING) != 0) {
1034                 lck_mtx_unlock(bpf_mlock);
1035                 return ENXIO;
1036         }
1037
1038         bpf_acquire_d(d);
1039
1040         /*
1041          * Restrict application to use a buffer the same size as
1042          * as kernel buffers.
1043          */
1044         if (uio_resid(uio) != d->bd_bufsize) {
1045                 bpf_release_d(d);
1046                 lck_mtx_unlock(bpf_mlock);
1047                 return EINVAL;
1048         }
1049
1050         if (d->bd_state == BPF_WAITING) {
1051                 bpf_stop_timer(d);
1052         }
1053
1054         timed_out = (d->bd_state == BPF_TIMED_OUT);
1055         d->bd_state = BPF_IDLE;
1056
1057         while (d->bd_hbuf_read != 0) {
1058                 msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading", NULL);
1059         }
1060
1061         if ((d->bd_flags & BPF_CLOSING) != 0) {
1062                 bpf_release_d(d);
1063                 lck_mtx_unlock(bpf_mlock);
1064                 return ENXIO;
1065         }
1066         /*
1067          * If the hold buffer is empty, then do a timed sleep, which
1068          * ends when the timeout expires or when enough packets
1069          * have arrived to fill the store buffer.
1070          */
1071         while (d->bd_hbuf == 0) {
1072                 if ((d->bd_immediate || timed_out || (ioflag & IO_NDELAY)) &&
1073                     d->bd_slen != 0) {
1074                         /*
1075                          * We're in immediate mode, or are reading
1076                          * in non-blocking mode, or a timer was
1077                          * started before the read (e.g., by select()
1078                          * or poll()) and has expired and a packet(s)
1079                          * either arrived since the previous
1080                          * read or arrived while we were asleep.
1081                          * Rotate the buffers and return what's here.
1082                          */
1083                         ROTATE_BUFFERS(d);
1084                         break;
1085                 }
1086
1087                 /*
1088                  * No data is available, check to see if the bpf device
1089                  * is still pointed at a real interface.  If not, return
1090                  * ENXIO so that the userland process knows to rebind
1091                  * it before using it again.
1092                  */
1093                 if (d->bd_bif == NULL) {
1094                         bpf_release_d(d);
1095                         lck_mtx_unlock(bpf_mlock);
1096                         return ENXIO;
1097                 }
1098                 if (ioflag & IO_NDELAY) {
1099                         bpf_release_d(d);
1100                         lck_mtx_unlock(bpf_mlock);
1101                         return EWOULDBLOCK;
1102                 }
1103                 error = BPF_SLEEP(d, PRINET | PCATCH, "bpf", d->bd_rtout);
1104                 /*
1105                  * Make sure device is still opened
1106                  */
1107                 if ((d->bd_flags & BPF_CLOSING) != 0) {
1108                         bpf_release_d(d);
1109                         lck_mtx_unlock(bpf_mlock);
1110                         return ENXIO;
1111                 }
1112
1113                 while (d->bd_hbuf_read != 0) {
1114                         msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading",
1115                             NULL);
1116                 }
1117
1118                 if ((d->bd_flags & BPF_CLOSING) != 0) {
1119                         bpf_release_d(d);
1120                         lck_mtx_unlock(bpf_mlock);
1121                         return ENXIO;
1122                 }
1123
1124                 if (error == EINTR || error == ERESTART) {
1125                         if (d->bd_hbuf != NULL) {
1126                                 /*
1127                                  * Because we msleep, the hold buffer might
1128                                  * be filled when we wake up.  Avoid rotating
1129                                  * in this case.
1130                                  */
1131                                 break;
1132                         }
1133                         if (d->bd_slen != 0) {
1134                                 /*
1135                                  * Sometimes we may be interrupted often and
1136                                  * the sleep above will not timeout.
1137                                  * Regardless, we should rotate the buffers
1138                                  * if there's any new data pending and
1139                                  * return it.
1140                                  */
1141                                 ROTATE_BUFFERS(d);
1142                                 break;
1143                         }
1144                         bpf_release_d(d);
1145                         lck_mtx_unlock(bpf_mlock);
1146                         if (error == ERESTART) {
1147                                 printf("%s: %llx ERESTART to EINTR\n",
1148                                     __func__, (uint64_t)VM_KERNEL_ADDRPERM(d));
1149                                 error = EINTR;
1150                         }
1151                         return error;
1152                 }
1153                 if (error == EWOULDBLOCK) {
1154                         /*
1155                          * On a timeout, return what's in the buffer,
1156                          * which may be nothing.  If there is something
1157                          * in the store buffer, we can rotate the buffers.
1158                          */
1159                         if (d->bd_hbuf) {
1160                                 /*
1161                                  * We filled up the buffer in between
1162                                  * getting the timeout and arriving
1163                                  * here, so we don't need to rotate.
1164                                  */
1165                                 break;
1166                         }
1167
1168                         if (d->bd_slen == 0) {
1169                                 bpf_release_d(d);
1170                                 lck_mtx_unlock(bpf_mlock);
1171                                 return 0;
1172                         }
1173                         ROTATE_BUFFERS(d);
1174                         break;
1175                 }
1176         }
1177         /*
1178          * At this point, we know we have something in the hold slot.
1179          */
1180
1181         /*
1182          * Set the hold buffer read. So we do not
1183          * rotate the buffers until the hold buffer
1184          * read is complete. Also to avoid issues resulting
1185          * from page faults during disk sleep (<rdar://problem/13436396>).
1186          */
1187         d->bd_hbuf_read = 1;
1188         hbuf = d->bd_hbuf;
1189         hbuf_len = d->bd_hlen;
1190         flags = d->bd_flags;
1191         lck_mtx_unlock(bpf_mlock);
1192
1193 #ifdef __APPLE__
1194         /*
1195          * Before we move data to userland, we fill out the extended
1196          * header fields.
1197          */
1198         if (flags & BPF_EXTENDED_HDR) {
1199                 char *p;
1200
1201                 p = hbuf;
1202                 while (p < hbuf + hbuf_len) {
1203                         struct bpf_hdr_ext *ehp;
1204                         uint32_t flowid;
1205                         struct so_procinfo soprocinfo;
1206                         int found = 0;
1207
1208                         ehp = (struct bpf_hdr_ext *)(void *)p;
1209                         if ((flowid = ehp->bh_flowid) != 0) {
1210                                 if (ehp->bh_proto == IPPROTO_TCP) {
1211                                         found = inp_findinpcb_procinfo(&tcbinfo,
1212                                             flowid, &soprocinfo);
1213                                 } else if (ehp->bh_proto == IPPROTO_UDP) {
1214                                         found = inp_findinpcb_procinfo(&udbinfo,
1215                                             flowid, &soprocinfo);
1216                                 }
1217                                 if (found == 1) {
1218                                         ehp->bh_pid = soprocinfo.spi_pid;
1219                                         strlcpy(&ehp->bh_comm[0], &soprocinfo.spi_proc_name[0], sizeof(ehp->bh_comm));
1220                                 }
1221                                 ehp->bh_flowid = 0;
1222                         }
1223
1224                         if (flags & BPF_FINALIZE_PKTAP) {
1225                                 struct pktap_header *pktaphdr;
1226
1227                                 pktaphdr = (struct pktap_header *)(void *)
1228                                     (p + BPF_WORDALIGN(ehp->bh_hdrlen));
1229
1230                                 bpf_finalize_pktap((struct bpf_hdr *) ehp,
1231                                     pktaphdr);
1232                         }
1233                         p += BPF_WORDALIGN(ehp->bh_hdrlen + ehp->bh_caplen);
1234                 }
1235         } else if (flags & BPF_FINALIZE_PKTAP) {
1236                 char *p;
1237
1238                 p = hbuf;
1239                 while (p < hbuf + hbuf_len) {
1240                         struct bpf_hdr *hp;
1241                         struct pktap_header *pktaphdr;
1242
1243                         hp = (struct bpf_hdr *)(void *)p;
1244                         pktaphdr = (struct pktap_header *)(void *)
1245                             (p + BPF_WORDALIGN(hp->bh_hdrlen));
1246
1247                         bpf_finalize_pktap(hp, pktaphdr);
1248
1249                         p += BPF_WORDALIGN(hp->bh_hdrlen + hp->bh_caplen);
1250                 }
1251         }
1252 #endif
1253
1254         /*
1255          * Move data from hold buffer into user space.
1256          * We know the entire buffer is transferred since
1257          * we checked above that the read buffer is bpf_bufsize bytes.
1258          */
1259         error = UIOMOVE(hbuf, hbuf_len, UIO_READ, uio);
1260
1261         lck_mtx_lock(bpf_mlock);
1262         /*
1263          * Make sure device is still opened
1264          */
1265         if ((d->bd_flags & BPF_CLOSING) != 0) {
1266                 bpf_release_d(d);
1267                 lck_mtx_unlock(bpf_mlock);
1268                 return ENXIO;
1269         }
1270
1271         d->bd_hbuf_read = 0;
1272         d->bd_fbuf = d->bd_hbuf;
1273         d->bd_hbuf = NULL;
1274         d->bd_hlen = 0;
1275         d->bd_hcnt = 0;
1276         wakeup((caddr_t)d);
1277
1278         bpf_release_d(d);
1279         lck_mtx_unlock(bpf_mlock);
1280         return error;
1281 }
1282
1283 /*
1284  * If there are processes sleeping on this descriptor, wake them up.
1285  */
1286 static void
1287 bpf_wakeup(struct bpf_d *d)
1288 {
1289         if (d->bd_state == BPF_WAITING) {
1290                 bpf_stop_timer(d);
1291                 d->bd_state = BPF_IDLE;
1292         }
1293         wakeup((caddr_t)d);
1294         if (d->bd_async && d->bd_sig && d->bd_sigio) {
1295                 pgsigio(d->bd_sigio, d->bd_sig);
1296         }
1297
1298         selwakeup(&d->bd_sel);
1299         if ((d->bd_flags & BPF_KNOTE)) {
1300                 KNOTE(&d->bd_sel.si_note, 1);
1301         }
1302 }
1303
1304 static void
1305 bpf_timed_out(void *arg, __unused void *dummy)
1306 {
1307         struct bpf_d *d = (struct bpf_d *)arg;
1308
1309         lck_mtx_lock(bpf_mlock);
1310         if (d->bd_state == BPF_WAITING) {
1311                 /*
1312                  * There's a select or kqueue waiting for this; if there's
1313                  * now stuff to read, wake it up.
1314                  */
1315                 d->bd_state = BPF_TIMED_OUT;
1316                 if (d->bd_slen != 0) {
1317                         bpf_wakeup(d);
1318                 }
1319         } else if (d->bd_state == BPF_DRAINING) {
1320                 /*
1321                  * A close is waiting for this to finish.
1322                  * Mark it as finished, and wake the close up.
1323                  */
1324                 d->bd_state = BPF_IDLE;
1325                 bpf_wakeup(d);
1326         }
1327         lck_mtx_unlock(bpf_mlock);
1328 }
1329
1330 /* keep in sync with bpf_movein above: */
1331 #define MAX_DATALINK_HDR_LEN    (sizeof(struct firewire_header))
1332
1333 int
1334 bpfwrite(dev_t dev, struct uio *uio, __unused int ioflag)
1335 {
1336         struct bpf_d *d;
1337         struct ifnet *ifp;
1338         struct mbuf *m = NULL;
1339         int error;
1340         char              dst_buf[SOCKADDR_HDR_LEN + MAX_DATALINK_HDR_LEN];
1341         int datlen = 0;
1342         int bif_dlt;
1343         int bd_hdrcmplt;
1344
1345         lck_mtx_lock(bpf_mlock);
1346
1347         d = bpf_dtab[minor(dev)];
1348         if (d == NULL || d == BPF_DEV_RESERVED ||
1349             (d->bd_flags & BPF_CLOSING) != 0) {
1350                 lck_mtx_unlock(bpf_mlock);
1351                 return ENXIO;
1352         }
1353
1354         bpf_acquire_d(d);
1355
1356         if (d->bd_bif == 0) {
1357                 bpf_release_d(d);
1358                 lck_mtx_unlock(bpf_mlock);
1359                 return ENXIO;
1360         }
1361
1362         ifp = d->bd_bif->bif_ifp;
1363
1364         if ((ifp->if_flags & IFF_UP) == 0) {
1365                 bpf_release_d(d);
1366                 lck_mtx_unlock(bpf_mlock);
1367                 return ENETDOWN;
1368         }
1369         if (uio_resid(uio) == 0) {
1370                 bpf_release_d(d);
1371                 lck_mtx_unlock(bpf_mlock);
1372                 return 0;
1373         }
1374         ((struct sockaddr *)dst_buf)->sa_len = sizeof(dst_buf);
1375
1376         /*
1377          * fix for PR-6849527
1378          * geting variables onto stack before dropping lock for bpf_movein()
1379          */
1380         bif_dlt = (int)d->bd_bif->bif_dlt;
1381         bd_hdrcmplt  = d->bd_hdrcmplt;
1382
1383         /* bpf_movein allocating mbufs; drop lock */
1384         lck_mtx_unlock(bpf_mlock);
1385
1386         error = bpf_movein(uio, bif_dlt, &m,
1387             bd_hdrcmplt ? NULL : (struct sockaddr *)dst_buf,
1388             &datlen);
1389
1390         /* take the lock again */
1391         lck_mtx_lock(bpf_mlock);
1392         if (error) {
1393                 bpf_release_d(d);
1394                 lck_mtx_unlock(bpf_mlock);
1395                 return error;
1396         }
1397
1398         /* verify the device is still open */
1399         if ((d->bd_flags & BPF_CLOSING) != 0) {
1400                 bpf_release_d(d);
1401                 lck_mtx_unlock(bpf_mlock);
1402                 m_freem(m);
1403                 return ENXIO;
1404         }
1405
1406         if (d->bd_bif == NULL) {
1407                 bpf_release_d(d);
1408                 lck_mtx_unlock(bpf_mlock);
1409                 m_free(m);
1410                 return ENXIO;
1411         }
1412
1413         if ((unsigned)datlen > ifp->if_mtu) {
1414                 bpf_release_d(d);
1415                 lck_mtx_unlock(bpf_mlock);
1416                 m_freem(m);
1417                 return EMSGSIZE;
1418         }
1419
1420         bpf_set_packet_service_class(m, d->bd_traffic_class);
1421
1422         lck_mtx_unlock(bpf_mlock);
1423
1424         /*
1425          * The driver frees the mbuf.
1426          */
1427         if (d->bd_hdrcmplt) {
1428                 if (d->bd_bif->bif_send) {
1429                         error = d->bd_bif->bif_send(ifp, d->bd_bif->bif_dlt, m);
1430                 } else {
1431                         error = dlil_output(ifp, 0, m, NULL, NULL, 1, NULL);
1432                 }
1433         } else {
1434                 error = dlil_output(ifp, PF_INET, m, NULL,
1435                     (struct sockaddr *)dst_buf, 0, NULL);
1436         }
1437
1438         lck_mtx_lock(bpf_mlock);
1439         bpf_release_d(d);
1440         lck_mtx_unlock(bpf_mlock);
1441
1442         return error;
1443 }
1444
1445 /*
1446  * Reset a descriptor by flushing its packet buffer and clearing the
1447  * receive and drop counts.
1448  */
1449 static void
1450 reset_d(struct bpf_d *d)
1451 {
1452         if (d->bd_hbuf_read != 0) {
1453                 panic("resetting buffers during read");
1454         }
1455
1456         if (d->bd_hbuf) {
1457                 /* Free the hold buffer. */
1458                 d->bd_fbuf = d->bd_hbuf;
1459                 d->bd_hbuf = NULL;
1460         }
1461         d->bd_slen = 0;
1462         d->bd_hlen = 0;
1463         d->bd_scnt = 0;
1464         d->bd_hcnt = 0;
1465         d->bd_rcount = 0;
1466         d->bd_dcount = 0;
1467 }
1468
1469 static struct bpf_d *
1470 bpf_get_device_from_uuid(uuid_t uuid)
1471 {
1472         unsigned int i;
1473
1474         for (i = 0; i < nbpfilter; i++) {
1475                 struct bpf_d *d = bpf_dtab[i];
1476
1477                 if (d == NULL || d == BPF_DEV_RESERVED ||
1478                     (d->bd_flags & BPF_CLOSING) != 0) {
1479                         continue;
1480                 }
1481                 if (uuid_compare(uuid, d->bd_uuid) == 0) {
1482                         return d;
1483                 }
1484         }
1485
1486         return NULL;
1487 }
1488
1489 /*
1490  * The BIOCSETUP command "atomically" attach to the interface and
1491  * copy the buffer from another interface. This minimizes the risk
1492  * of missing packet because this is done while holding
1493  * the BPF global lock
1494  */
1495 static int
1496 bpf_setup(struct bpf_d *d_to, uuid_t uuid_from, ifnet_t ifp)
1497 {
1498         struct bpf_d *d_from;
1499         int error = 0;
1500
1501         LCK_MTX_ASSERT(bpf_mlock, LCK_MTX_ASSERT_OWNED);
1502
1503         /*
1504          * Sanity checks
1505          */
1506         d_from = bpf_get_device_from_uuid(uuid_from);
1507         if (d_from == NULL) {
1508                 error = ENOENT;
1509                 os_log_info(OS_LOG_DEFAULT,
1510                     "%s: uuids not found error %d",
1511                     __func__, error);
1512                 return error;
1513         }
1514         if (d_from->bd_opened_by != d_to->bd_opened_by) {
1515                 error = EACCES;
1516                 os_log_info(OS_LOG_DEFAULT,
1517                     "%s: processes not matching error %d",
1518                     __func__, error);
1519                 return error;
1520         }
1521
1522         /*
1523          * Prevent any read while copying
1524          */
1525         while (d_to->bd_hbuf_read != 0) {
1526                 msleep((caddr_t)d_to, bpf_mlock, PRINET, __func__, NULL);
1527         }
1528         d_to->bd_hbuf_read = 1;
1529
1530         while (d_from->bd_hbuf_read != 0) {
1531                 msleep((caddr_t)d_from, bpf_mlock, PRINET, __func__, NULL);
1532         }
1533         d_from->bd_hbuf_read = 1;
1534
1535         /*
1536          * Verify the devices have not been closed
1537          */
1538         if (d_to->bd_flags & BPF_CLOSING) {
1539                 error = ENXIO;
1540                 os_log_info(OS_LOG_DEFAULT,
1541                     "%s: d_to is closing error %d",
1542                     __func__, error);
1543                 goto done;
1544         }
1545         if (d_from->bd_flags & BPF_CLOSING) {
1546                 error = ENXIO;
1547                 os_log_info(OS_LOG_DEFAULT,
1548                     "%s: d_from is closing error %d",
1549                     __func__, error);
1550                 goto done;
1551         }
1552
1553         /*
1554          * For now require the same buffer size
1555          */
1556         if (d_from->bd_bufsize != d_to->bd_bufsize) {
1557                 error = EINVAL;
1558                 os_log_info(OS_LOG_DEFAULT,
1559                     "%s: bufsizes not matching error %d",
1560                     __func__, error);
1561                 goto done;
1562         }
1563
1564         /*
1565          * Attach to the interface
1566          */
1567         error = bpf_setif(d_to, ifp, false, true);
1568         if (error != 0) {
1569                 os_log_info(OS_LOG_DEFAULT,
1570                     "%s: bpf_setif() failed error %d",
1571                     __func__, error);
1572                 goto done;
1573         }
1574
1575         /*
1576          * Make sure the buffers are setup as expected by bpf_setif()
1577          */
1578         ASSERT(d_to->bd_hbuf == NULL);
1579         ASSERT(d_to->bd_sbuf != NULL);
1580         ASSERT(d_to->bd_fbuf != NULL);
1581
1582         /*
1583          * Copy the buffers and update the pointers and counts
1584          */
1585         memcpy(d_to->bd_sbuf, d_from->bd_sbuf, d_from->bd_slen);
1586         d_to->bd_slen = d_from->bd_slen;
1587         d_to->bd_scnt = d_from->bd_scnt;
1588
1589         if (d_from->bd_hbuf != NULL) {
1590                 d_to->bd_hbuf = d_to->bd_fbuf;
1591                 d_to->bd_fbuf = NULL;
1592                 memcpy(d_to->bd_hbuf, d_from->bd_hbuf, d_from->bd_hlen);
1593         }
1594         d_to->bd_hlen = d_from->bd_hlen;
1595         d_to->bd_hcnt = d_from->bd_hcnt;
1596
1597         if (bpf_debug > 0) {
1598                 os_log_info(OS_LOG_DEFAULT,
1599                     "%s: done slen %u scnt %u hlen %u hcnt %u",
1600                     __func__, d_to->bd_slen, d_to->bd_scnt,
1601                     d_to->bd_hlen, d_to->bd_hcnt);
1602         }
1603 done:
1604         d_from->bd_hbuf_read = 0;
1605         wakeup((caddr_t)d_from);
1606
1607         d_to->bd_hbuf_read = 0;
1608         wakeup((caddr_t)d_to);
1609
1610         return error;
1611 }
1612
1613 /*
1614  *  FIONREAD            Check for read packet available.
1615  *  SIOCGIFADDR         Get interface address - convenient hook to driver.
1616  *  BIOCGBLEN           Get buffer len [for read()].
1617  *  BIOCSETF            Set ethernet read filter.
1618  *  BIOCFLUSH           Flush read packet buffer.
1619  *  BIOCPROMISC         Put interface into promiscuous mode.
1620  *  BIOCGDLT            Get link layer type.
1621  *  BIOCGETIF           Get interface name.
1622  *  BIOCSETIF           Set interface.
1623  *  BIOCSRTIMEOUT       Set read timeout.
1624  *  BIOCGRTIMEOUT       Get read timeout.
1625  *  BIOCGSTATS          Get packet stats.
1626  *  BIOCIMMEDIATE       Set immediate mode.
1627  *  BIOCVERSION         Get filter language version.
1628  *  BIOCGHDRCMPLT       Get "header already complete" flag
1629  *  BIOCSHDRCMPLT       Set "header already complete" flag
1630  *  BIOCGSEESENT        Get "see packets sent" flag
1631  *  BIOCSSEESENT        Set "see packets sent" flag
1632  *  BIOCSETTC           Set traffic class.
1633  *  BIOCGETTC           Get traffic class.
1634  *  BIOCSEXTHDR         Set "extended header" flag
1635  *  BIOCSHEADDROP       Drop head of the buffer if user is not reading
1636  *  BIOCGHEADDROP       Get "head-drop" flag
1637  */
1638 /* ARGSUSED */
1639 int
1640 bpfioctl(dev_t dev, u_long cmd, caddr_t addr, __unused int flags,
1641     struct proc *p)
1642 {
1643         struct bpf_d *d;
1644         int error = 0;
1645         u_int int_arg;
1646         struct ifreq ifr;
1647
1648         lck_mtx_lock(bpf_mlock);
1649
1650         d = bpf_dtab[minor(dev)];
1651         if (d == NULL || d == BPF_DEV_RESERVED ||
1652             (d->bd_flags & BPF_CLOSING) != 0) {
1653                 lck_mtx_unlock(bpf_mlock);
1654                 return ENXIO;
1655         }
1656
1657         bpf_acquire_d(d);
1658
1659         if (d->bd_state == BPF_WAITING) {
1660                 bpf_stop_timer(d);
1661         }
1662         d->bd_state = BPF_IDLE;
1663
1664         switch (cmd) {
1665         default:
1666                 error = EINVAL;
1667                 break;
1668
1669         /*
1670          * Check for read packet available.
1671          */
1672         case FIONREAD:                  /* int */
1673         {
1674                 int n;
1675
1676                 n = d->bd_slen;
1677                 if (d->bd_hbuf && d->bd_hbuf_read == 0) {
1678                         n += d->bd_hlen;
1679                 }
1680
1681                 bcopy(&n, addr, sizeof(n));
1682                 break;
1683         }
1684
1685         case SIOCGIFADDR:               /* struct ifreq */
1686         {
1687                 struct ifnet *ifp;
1688
1689                 if (d->bd_bif == 0) {
1690                         error = EINVAL;
1691                 } else {
1692                         ifp = d->bd_bif->bif_ifp;
1693                         error = ifnet_ioctl(ifp, 0, cmd, addr);
1694                 }
1695                 break;
1696         }
1697
1698         /*
1699          * Get buffer len [for read()].
1700          */
1701         case BIOCGBLEN:                 /* u_int */
1702                 bcopy(&d->bd_bufsize, addr, sizeof(u_int));
1703                 break;
1704
1705         /*
1706          * Set buffer length.
1707          */
1708         case BIOCSBLEN: {               /* u_int */
1709                 u_int size;
1710                 unsigned int maxbufsize = bpf_maxbufsize;
1711
1712                 /*
1713                  * Allow larger buffer in head drop mode to with the
1714                  * assumption the reading process may be low priority but
1715                  * is interested in the most recent traffic
1716                  */
1717                 if (d->bd_headdrop != 0) {
1718                         maxbufsize = 2 * bpf_maxbufsize;
1719                 }
1720
1721                 if (d->bd_bif != 0 || (d->bd_flags & BPF_DETACHING)) {
1722                         /*
1723                          * Interface already attached, unable to change buffers
1724                          */
1725                         error = EINVAL;
1726                         break;
1727                 }
1728                 bcopy(addr, &size, sizeof(size));
1729
1730                 if (size > maxbufsize) {
1731                         d->bd_bufsize = maxbufsize;
1732
1733                         os_log_info(OS_LOG_DEFAULT,
1734                             "%s bufsize capped to %u from %u",
1735                             __func__, d->bd_bufsize, size);
1736                 } else if (size < BPF_MINBUFSIZE) {
1737                         d->bd_bufsize = BPF_MINBUFSIZE;
1738
1739                         os_log_info(OS_LOG_DEFAULT,
1740                             "%s bufsize bumped to %u from %u",
1741                             __func__, d->bd_bufsize, size);
1742                 } else {
1743                         d->bd_bufsize = size;
1744                 }
1745
1746                 /* It's a read/write ioctl */
1747                 bcopy(&d->bd_bufsize, addr, sizeof(u_int));
1748                 break;
1749         }
1750         /*
1751          * Set link layer read filter.
1752          */
1753         case BIOCSETF32:
1754         case BIOCSETFNR32: {            /* struct bpf_program32 */
1755                 struct bpf_program32 prg32;
1756
1757                 bcopy(addr, &prg32, sizeof(prg32));
1758                 error = bpf_setf(d, prg32.bf_len,
1759                     CAST_USER_ADDR_T(prg32.bf_insns), cmd);
1760                 break;
1761         }
1762
1763         case BIOCSETF64:
1764         case BIOCSETFNR64: {            /* struct bpf_program64 */
1765                 struct bpf_program64 prg64;
1766
1767                 bcopy(addr, &prg64, sizeof(prg64));
1768                 error = bpf_setf(d, prg64.bf_len, prg64.bf_insns, cmd);
1769                 break;
1770         }
1771
1772         /*
1773          * Flush read packet buffer.
1774          */
1775         case BIOCFLUSH:
1776                 while (d->bd_hbuf_read != 0) {
1777                         msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading",
1778                             NULL);
1779                 }
1780                 if ((d->bd_flags & BPF_CLOSING) != 0) {
1781                         error = ENXIO;
1782                         break;
1783                 }
1784                 reset_d(d);
1785                 break;
1786
1787         /*
1788          * Put interface into promiscuous mode.
1789          */
1790         case BIOCPROMISC:
1791                 if (d->bd_bif == 0) {
1792                         /*
1793                          * No interface attached yet.
1794                          */
1795                         error = EINVAL;
1796                         break;
1797                 }
1798                 if (d->bd_promisc == 0) {
1799                         lck_mtx_unlock(bpf_mlock);
1800                         error = ifnet_set_promiscuous(d->bd_bif->bif_ifp, 1);
1801                         lck_mtx_lock(bpf_mlock);
1802                         if (error == 0) {
1803                                 d->bd_promisc = 1;
1804                         }
1805                 }
1806                 break;
1807
1808         /*
1809          * Get device parameters.
1810          */
1811         case BIOCGDLT:                  /* u_int */
1812                 if (d->bd_bif == 0) {
1813                         error = EINVAL;
1814                 } else {
1815                         bcopy(&d->bd_bif->bif_dlt, addr, sizeof(u_int));
1816                 }
1817                 break;
1818
1819         /*
1820          * Get a list of supported data link types.
1821          */
1822         case BIOCGDLTLIST:              /* struct bpf_dltlist */
1823                 if (d->bd_bif == NULL) {
1824                         error = EINVAL;
1825                 } else {
1826                         error = bpf_getdltlist(d, addr, p);
1827                 }
1828                 break;
1829
1830         /*
1831          * Set data link type.
1832          */
1833         case BIOCSDLT:                  /* u_int */
1834                 if (d->bd_bif == NULL) {
1835                         error = EINVAL;
1836                 } else {
1837                         u_int dlt;
1838
1839                         bcopy(addr, &dlt, sizeof(dlt));
1840
1841                         if (dlt == DLT_PKTAP &&
1842                             !(d->bd_flags & BPF_WANT_PKTAP)) {
1843                                 dlt = DLT_RAW;
1844                         }
1845                         error = bpf_setdlt(d, dlt);
1846                 }
1847                 break;
1848
1849         /*
1850          * Get interface name.
1851          */
1852         case BIOCGETIF:                 /* struct ifreq */
1853                 if (d->bd_bif == 0) {
1854                         error = EINVAL;
1855                 } else {
1856                         struct ifnet *const ifp = d->bd_bif->bif_ifp;
1857
1858                         snprintf(((struct ifreq *)(void *)addr)->ifr_name,
1859                             sizeof(ifr.ifr_name), "%s", if_name(ifp));
1860                 }
1861                 break;
1862
1863         /*
1864          * Set interface.
1865          */
1866         case BIOCSETIF: {               /* struct ifreq */
1867                 ifnet_t ifp;
1868
1869                 bcopy(addr, &ifr, sizeof(ifr));
1870                 ifr.ifr_name[IFNAMSIZ - 1] = '\0';
1871                 ifp = ifunit(ifr.ifr_name);
1872                 if (ifp == NULL) {
1873                         error = ENXIO;
1874                 } else {
1875                         error = bpf_setif(d, ifp, true, false);
1876                 }
1877                 break;
1878         }
1879
1880         /*
1881          * Set read timeout.
1882          */
1883         case BIOCSRTIMEOUT32: {         /* struct user32_timeval */
1884                 struct user32_timeval _tv;
1885                 struct timeval tv;
1886
1887                 bcopy(addr, &_tv, sizeof(_tv));
1888                 tv.tv_sec  = _tv.tv_sec;
1889                 tv.tv_usec = _tv.tv_usec;
1890
1891                 /*
1892                  * Subtract 1 tick from tvtohz() since this isn't
1893                  * a one-shot timer.
1894                  */
1895                 if ((error = itimerfix(&tv)) == 0) {
1896                         d->bd_rtout = tvtohz(&tv) - 1;
1897                 }
1898                 break;
1899         }
1900
1901         case BIOCSRTIMEOUT64: {         /* struct user64_timeval */
1902                 struct user64_timeval _tv;
1903                 struct timeval tv;
1904
1905                 bcopy(addr, &_tv, sizeof(_tv));
1906                 tv.tv_sec  = _tv.tv_sec;
1907                 tv.tv_usec = _tv.tv_usec;
1908
1909                 /*
1910                  * Subtract 1 tick from tvtohz() since this isn't
1911                  * a one-shot timer.
1912                  */
1913                 if ((error = itimerfix(&tv)) == 0) {
1914                         d->bd_rtout = tvtohz(&tv) - 1;
1915                 }
1916                 break;
1917         }
1918
1919         /*
1920          * Get read timeout.
1921          */
1922         case BIOCGRTIMEOUT32: {         /* struct user32_timeval */
1923                 struct user32_timeval tv;
1924
1925                 bzero(&tv, sizeof(tv));
1926                 tv.tv_sec = d->bd_rtout / hz;
1927                 tv.tv_usec = (d->bd_rtout % hz) * tick;
1928                 bcopy(&tv, addr, sizeof(tv));
1929                 break;
1930         }
1931
1932         case BIOCGRTIMEOUT64: {         /* struct user64_timeval */
1933                 struct user64_timeval tv;
1934
1935                 bzero(&tv, sizeof(tv));
1936                 tv.tv_sec = d->bd_rtout / hz;
1937                 tv.tv_usec = (d->bd_rtout % hz) * tick;
1938                 bcopy(&tv, addr, sizeof(tv));
1939                 break;
1940         }
1941
1942         /*
1943          * Get packet stats.
1944          */
1945         case BIOCGSTATS: {              /* struct bpf_stat */
1946                 struct bpf_stat bs;
1947
1948                 bzero(&bs, sizeof(bs));
1949                 bs.bs_recv = d->bd_rcount;
1950                 bs.bs_drop = d->bd_dcount;
1951                 bcopy(&bs, addr, sizeof(bs));
1952                 break;
1953         }
1954
1955         /*
1956          * Set immediate mode.
1957          */
1958         case BIOCIMMEDIATE:             /* u_int */
1959                 d->bd_immediate = *(u_int *)(void *)addr;
1960                 break;
1961
1962         case BIOCVERSION: {             /* struct bpf_version */
1963                 struct bpf_version bv;
1964
1965                 bzero(&bv, sizeof(bv));
1966                 bv.bv_major = BPF_MAJOR_VERSION;
1967                 bv.bv_minor = BPF_MINOR_VERSION;
1968                 bcopy(&bv, addr, sizeof(bv));
1969                 break;
1970         }
1971
1972         /*
1973          * Get "header already complete" flag
1974          */
1975         case BIOCGHDRCMPLT:             /* u_int */
1976                 bcopy(&d->bd_hdrcmplt, addr, sizeof(u_int));
1977                 break;
1978
1979         /*
1980          * Set "header already complete" flag
1981          */
1982         case BIOCSHDRCMPLT:             /* u_int */
1983                 bcopy(addr, &int_arg, sizeof(int_arg));
1984                 d->bd_hdrcmplt = int_arg ? 1 : 0;
1985                 break;
1986
1987         /*
1988          * Get "see sent packets" flag
1989          */
1990         case BIOCGSEESENT:              /* u_int */
1991                 bcopy(&d->bd_seesent, addr, sizeof(u_int));
1992                 break;
1993
1994         /*
1995          * Set "see sent packets" flag
1996          */
1997         case BIOCSSEESENT:              /* u_int */
1998                 bcopy(addr, &d->bd_seesent, sizeof(u_int));
1999                 break;
2000
2001         /*
2002          * Set traffic service class
2003          */
2004         case BIOCSETTC: {               /* int */
2005                 int tc;
2006
2007                 bcopy(addr, &tc, sizeof(int));
2008                 error = bpf_set_traffic_class(d, tc);
2009                 break;
2010         }
2011
2012         /*
2013          * Get traffic service class
2014          */
2015         case BIOCGETTC:                 /* int */
2016                 bcopy(&d->bd_traffic_class, addr, sizeof(int));
2017                 break;
2018
2019         case FIONBIO:           /* Non-blocking I/O; int */
2020                 break;
2021
2022         case FIOASYNC:          /* Send signal on receive packets; int */
2023                 bcopy(addr, &d->bd_async, sizeof(int));
2024                 break;
2025 #ifndef __APPLE__
2026         case FIOSETOWN:
2027                 error = fsetown(*(int *)addr, &d->bd_sigio);
2028                 break;
2029
2030         case FIOGETOWN:
2031                 *(int *)addr = fgetown(d->bd_sigio);
2032                 break;
2033
2034         /* This is deprecated, FIOSETOWN should be used instead. */
2035         case TIOCSPGRP:
2036                 error = fsetown(-(*(int *)addr), &d->bd_sigio);
2037                 break;
2038
2039         /* This is deprecated, FIOGETOWN should be used instead. */
2040         case TIOCGPGRP:
2041                 *(int *)addr = -fgetown(d->bd_sigio);
2042                 break;
2043 #endif
2044         case BIOCSRSIG: {       /* Set receive signal; u_int */
2045                 u_int sig;
2046
2047                 bcopy(addr, &sig, sizeof(u_int));
2048
2049                 if (sig >= NSIG) {
2050                         error = EINVAL;
2051                 } else {
2052                         d->bd_sig = sig;
2053                 }
2054                 break;
2055         }
2056         case BIOCGRSIG:                 /* u_int */
2057                 bcopy(&d->bd_sig, addr, sizeof(u_int));
2058                 break;
2059 #ifdef __APPLE__
2060         case BIOCSEXTHDR:               /* u_int */
2061                 bcopy(addr, &int_arg, sizeof(int_arg));
2062                 if (int_arg) {
2063                         d->bd_flags |= BPF_EXTENDED_HDR;
2064                 } else {
2065                         d->bd_flags &= ~BPF_EXTENDED_HDR;
2066                 }
2067                 break;
2068
2069         case BIOCGIFATTACHCOUNT: {              /* struct ifreq */
2070                 ifnet_t ifp;
2071                 struct bpf_if *bp;
2072
2073                 bcopy(addr, &ifr, sizeof(ifr));
2074                 ifr.ifr_name[IFNAMSIZ - 1] = '\0';
2075                 ifp = ifunit(ifr.ifr_name);
2076                 if (ifp == NULL) {
2077                         error = ENXIO;
2078                         break;
2079                 }
2080                 ifr.ifr_intval = 0;
2081                 for (bp = bpf_iflist; bp != 0; bp = bp->bif_next) {
2082                         struct bpf_d *bpf_d;
2083
2084                         if (bp->bif_ifp == NULL || bp->bif_ifp != ifp) {
2085                                 continue;
2086                         }
2087                         for (bpf_d = bp->bif_dlist; bpf_d;
2088                             bpf_d = bpf_d->bd_next) {
2089                                 ifr.ifr_intval += 1;
2090                         }
2091                 }
2092                 bcopy(&ifr, addr, sizeof(ifr));
2093                 break;
2094         }
2095         case BIOCGWANTPKTAP:                    /* u_int */
2096                 int_arg = d->bd_flags & BPF_WANT_PKTAP ? 1 : 0;
2097                 bcopy(&int_arg, addr, sizeof(int_arg));
2098                 break;
2099
2100         case BIOCSWANTPKTAP:                    /* u_int */
2101                 bcopy(addr, &int_arg, sizeof(int_arg));
2102                 if (int_arg) {
2103                         d->bd_flags |= BPF_WANT_PKTAP;
2104                 } else {
2105                         d->bd_flags &= ~BPF_WANT_PKTAP;
2106                 }
2107                 break;
2108 #endif
2109
2110         case BIOCSHEADDROP:
2111                 bcopy(addr, &int_arg, sizeof(int_arg));
2112                 d->bd_headdrop = int_arg ? 1 : 0;
2113                 break;
2114
2115         case BIOCGHEADDROP:
2116                 bcopy(&d->bd_headdrop, addr, sizeof(int));
2117                 break;
2118
2119         case BIOCSTRUNCATE:
2120                 bcopy(addr, &int_arg, sizeof(int_arg));
2121                 if (int_arg) {
2122                         d->bd_flags |=  BPF_TRUNCATE;
2123                 } else {
2124                         d->bd_flags &= ~BPF_TRUNCATE;
2125                 }
2126                 break;
2127
2128         case BIOCGETUUID:
2129                 bcopy(&d->bd_uuid, addr, sizeof(uuid_t));
2130                 break;
2131
2132         case BIOCSETUP: {
2133                 struct bpf_setup_args bsa;
2134                 ifnet_t ifp;
2135
2136                 bcopy(addr, &bsa, sizeof(struct bpf_setup_args));
2137                 bsa.bsa_ifname[IFNAMSIZ - 1] = 0;
2138                 ifp = ifunit(bsa.bsa_ifname);
2139                 if (ifp == NULL) {
2140                         error = ENXIO;
2141                         os_log_info(OS_LOG_DEFAULT,
2142                             "%s: ifnet not found for %s error %d",
2143                             __func__, bsa.bsa_ifname, error);
2144                         break;
2145                 }
2146
2147                 error = bpf_setup(d, bsa.bsa_uuid, ifp);
2148                 break;
2149         }
2150         case BIOCSPKTHDRV2:
2151                 bcopy(addr, &int_arg, sizeof(int_arg));
2152                 if (int_arg != 0) {
2153                         d->bd_flags |= BPF_PKTHDRV2;
2154                 } else {
2155                         d->bd_flags &= ~BPF_PKTHDRV2;
2156                 }
2157                 break;
2158
2159         case BIOCGPKTHDRV2:
2160                 int_arg = d->bd_flags & BPF_PKTHDRV2 ? 1 : 0;
2161                 bcopy(&int_arg, addr, sizeof(int));
2162                 break;
2163         }
2164
2165         bpf_release_d(d);
2166         lck_mtx_unlock(bpf_mlock);
2167
2168         return error;
2169 }
2170
2171 /*
2172  * Set d's packet filter program to fp.  If this file already has a filter,
2173  * free it and replace it.  Returns EINVAL for bogus requests.
2174  */
2175 static int
2176 bpf_setf(struct bpf_d *d, u_int bf_len, user_addr_t bf_insns,
2177     u_long cmd)
2178 {
2179         struct bpf_insn *fcode, *old;
2180         u_int flen, size;
2181
2182         while (d->bd_hbuf_read != 0) {
2183                 msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading", NULL);
2184         }
2185
2186         if ((d->bd_flags & BPF_CLOSING) != 0) {
2187                 return ENXIO;
2188         }
2189
2190         old = d->bd_filter;
2191         if (bf_insns == USER_ADDR_NULL) {
2192                 if (bf_len != 0) {
2193                         return EINVAL;
2194                 }
2195                 d->bd_filter = NULL;
2196                 reset_d(d);
2197                 if (old != 0) {
2198                         FREE(old, M_DEVBUF);
2199                 }
2200                 return 0;
2201         }
2202         flen = bf_len;
2203         if (flen > BPF_MAXINSNS) {
2204                 return EINVAL;
2205         }
2206
2207         size = flen * sizeof(struct bpf_insn);
2208         fcode = (struct bpf_insn *) _MALLOC(size, M_DEVBUF, M_WAIT);
2209 #ifdef __APPLE__
2210         if (fcode == NULL) {
2211                 return ENOBUFS;
2212         }
2213 #endif
2214         if (copyin(bf_insns, (caddr_t)fcode, size) == 0 &&
2215             bpf_validate(fcode, (int)flen)) {
2216                 d->bd_filter = fcode;
2217
2218                 if (cmd == BIOCSETF32 || cmd == BIOCSETF64) {
2219                         reset_d(d);
2220                 }
2221
2222                 if (old != 0) {
2223                         FREE(old, M_DEVBUF);
2224                 }
2225
2226                 return 0;
2227         }
2228         FREE(fcode, M_DEVBUF);
2229         return EINVAL;
2230 }
2231
2232 /*
2233  * Detach a file from its current interface (if attached at all) and attach
2234  * to the interface indicated by the name stored in ifr.
2235  * Return an errno or 0.
2236  */
2237 static int
2238 bpf_setif(struct bpf_d *d, ifnet_t theywant, bool do_reset, bool has_hbuf_read)
2239 {
2240         struct bpf_if *bp;
2241         int error;
2242
2243         while (d->bd_hbuf_read != 0 && !has_hbuf_read) {
2244                 msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading", NULL);
2245         }
2246
2247         if ((d->bd_flags & BPF_CLOSING) != 0) {
2248                 return ENXIO;
2249         }
2250
2251         /*
2252          * Look through attached interfaces for the named one.
2253          */
2254         for (bp = bpf_iflist; bp != 0; bp = bp->bif_next) {
2255                 struct ifnet *ifp = bp->bif_ifp;
2256
2257                 if (ifp == 0 || ifp != theywant) {
2258                         continue;
2259                 }
2260                 /*
2261                  * Do not use DLT_PKTAP, unless requested explicitly
2262                  */
2263                 if (bp->bif_dlt == DLT_PKTAP && !(d->bd_flags & BPF_WANT_PKTAP)) {
2264                         continue;
2265                 }
2266                 /*
2267                  * Skip the coprocessor interface
2268                  */
2269                 if (!intcoproc_unrestricted && IFNET_IS_INTCOPROC(ifp)) {
2270                         continue;
2271                 }
2272                 /*
2273                  * We found the requested interface.
2274                  * Allocate the packet buffers.
2275                  */
2276                 error = bpf_allocbufs(d);
2277                 if (error != 0) {
2278                         return error;
2279                 }
2280                 /*
2281                  * Detach if attached to something else.
2282                  */
2283                 if (bp != d->bd_bif) {
2284                         if (d->bd_bif != NULL) {
2285                                 if (bpf_detachd(d, 0) != 0) {
2286                                         return ENXIO;
2287                                 }
2288                         }
2289                         if (bpf_attachd(d, bp) != 0) {
2290                                 return ENXIO;
2291                         }
2292                 }
2293                 if (do_reset) {
2294                         reset_d(d);
2295                 }
2296                 return 0;
2297         }
2298         /* Not found. */
2299         return ENXIO;
2300 }
2301
2302 /*
2303  * Get a list of available data link type of the interface.
2304  */
2305 static int
2306 bpf_getdltlist(struct bpf_d *d, caddr_t addr, struct proc *p)
2307 {
2308         u_int           n;
2309         int             error;
2310         struct ifnet    *ifp;
2311         struct bpf_if   *bp;
2312         user_addr_t     dlist;
2313         struct bpf_dltlist bfl;
2314
2315         bcopy(addr, &bfl, sizeof(bfl));
2316         if (proc_is64bit(p)) {
2317                 dlist = (user_addr_t)bfl.bfl_u.bflu_pad;
2318         } else {
2319                 dlist = CAST_USER_ADDR_T(bfl.bfl_u.bflu_list);
2320         }
2321
2322         ifp = d->bd_bif->bif_ifp;
2323         n = 0;
2324         error = 0;
2325
2326         for (bp = bpf_iflist; bp; bp = bp->bif_next) {
2327                 if (bp->bif_ifp != ifp) {
2328                         continue;
2329                 }
2330                 /*
2331                  * Do not use DLT_PKTAP, unless requested explicitly
2332                  */
2333                 if (bp->bif_dlt == DLT_PKTAP && !(d->bd_flags & BPF_WANT_PKTAP)) {
2334                         continue;
2335                 }
2336                 if (dlist != USER_ADDR_NULL) {
2337                         if (n >= bfl.bfl_len) {
2338                                 return ENOMEM;
2339                         }
2340                         error = copyout(&bp->bif_dlt, dlist,
2341                             sizeof(bp->bif_dlt));
2342                         if (error != 0) {
2343                                 break;
2344                         }
2345                         dlist += sizeof(bp->bif_dlt);
2346                 }
2347                 n++;
2348         }
2349         bfl.bfl_len = n;
2350         bcopy(&bfl, addr, sizeof(bfl));
2351
2352         return error;
2353 }
2354
2355 /*
2356  * Set the data link type of a BPF instance.
2357  */
2358 static int
2359 bpf_setdlt(struct bpf_d *d, uint32_t dlt)
2360 {
2361         int error, opromisc;
2362         struct ifnet *ifp;
2363         struct bpf_if *bp;
2364
2365         if (d->bd_bif->bif_dlt == dlt) {
2366                 return 0;
2367         }
2368
2369         while (d->bd_hbuf_read != 0) {
2370                 msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading", NULL);
2371         }
2372
2373         if ((d->bd_flags & BPF_CLOSING) != 0) {
2374                 return ENXIO;
2375         }
2376
2377         ifp = d->bd_bif->bif_ifp;
2378         for (bp = bpf_iflist; bp; bp = bp->bif_next) {
2379                 if (bp->bif_ifp == ifp && bp->bif_dlt == dlt) {
2380                         /*
2381                          * Do not use DLT_PKTAP, unless requested explicitly
2382                          */
2383                         if (bp->bif_dlt == DLT_PKTAP &&
2384                             !(d->bd_flags & BPF_WANT_PKTAP)) {
2385                                 continue;
2386                         }
2387                         break;
2388                 }
2389         }
2390         if (bp != NULL) {
2391                 opromisc = d->bd_promisc;
2392                 if (bpf_detachd(d, 0) != 0) {
2393                         return ENXIO;
2394                 }
2395                 error = bpf_attachd(d, bp);
2396                 if (error) {
2397                         printf("bpf_setdlt: bpf_attachd %s%d failed (%d)\n",
2398                             ifnet_name(bp->bif_ifp), ifnet_unit(bp->bif_ifp),
2399                             error);
2400                         return error;
2401                 }
2402                 reset_d(d);
2403                 if (opromisc) {
2404                         lck_mtx_unlock(bpf_mlock);
2405                         error = ifnet_set_promiscuous(bp->bif_ifp, 1);
2406                         lck_mtx_lock(bpf_mlock);
2407                         if (error) {
2408                                 printf("%s: ifpromisc %s%d failed (%d)\n",
2409                                     __func__, ifnet_name(bp->bif_ifp),
2410                                     ifnet_unit(bp->bif_ifp), error);
2411                         } else {
2412                                 d->bd_promisc = 1;
2413                         }
2414                 }
2415         }
2416         return bp == NULL ? EINVAL : 0;
2417 }
2418
2419 static int
2420 bpf_set_traffic_class(struct bpf_d *d, int tc)
2421 {
2422         int error = 0;
2423
2424         if (!SO_VALID_TC(tc)) {
2425                 error = EINVAL;
2426         } else {
2427                 d->bd_traffic_class = tc;
2428         }
2429
2430         return error;
2431 }
2432
2433 static void
2434 bpf_set_packet_service_class(struct mbuf *m, int tc)
2435 {
2436         if (!(m->m_flags & M_PKTHDR)) {
2437                 return;
2438         }
2439
2440         VERIFY(SO_VALID_TC(tc));
2441         (void) m_set_service_class(m, so_tc2msc(tc));
2442 }
2443
2444 /*
2445  * Support for select()
2446  *
2447  * Return true iff the specific operation will not block indefinitely.
2448  * Otherwise, return false but make a note that a selwakeup() must be done.
2449  */
2450 int
2451 bpfselect(dev_t dev, int which, void * wql, struct proc *p)
2452 {
2453         struct bpf_d *d;
2454         int ret = 0;
2455
2456         lck_mtx_lock(bpf_mlock);
2457
2458         d = bpf_dtab[minor(dev)];
2459         if (d == NULL || d == BPF_DEV_RESERVED ||
2460             (d->bd_flags & BPF_CLOSING) != 0) {
2461                 lck_mtx_unlock(bpf_mlock);
2462                 return ENXIO;
2463         }
2464
2465         bpf_acquire_d(d);
2466
2467         if (d->bd_bif == NULL) {
2468                 bpf_release_d(d);
2469                 lck_mtx_unlock(bpf_mlock);
2470                 return ENXIO;
2471         }
2472
2473         while (d->bd_hbuf_read != 0) {
2474                 msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading", NULL);
2475         }
2476
2477         if ((d->bd_flags & BPF_CLOSING) != 0) {
2478                 bpf_release_d(d);
2479                 lck_mtx_unlock(bpf_mlock);
2480                 return ENXIO;
2481         }
2482
2483         switch (which) {
2484         case FREAD:
2485                 if (d->bd_hlen != 0 ||
2486                     ((d->bd_immediate ||
2487                     d->bd_state == BPF_TIMED_OUT) && d->bd_slen != 0)) {
2488                         ret = 1;         /* read has data to return */
2489                 } else {
2490                         /*
2491                          * Read has no data to return.
2492                          * Make the select wait, and start a timer if
2493                          * necessary.
2494                          */
2495                         selrecord(p, &d->bd_sel, wql);
2496                         bpf_start_timer(d);
2497                 }
2498                 break;
2499
2500         case FWRITE:
2501                 /* can't determine whether a write would block */
2502                 ret = 1;
2503                 break;
2504         }
2505
2506         bpf_release_d(d);
2507         lck_mtx_unlock(bpf_mlock);
2508
2509         return ret;
2510 }
2511
2512 /*
2513  * Support for kevent() system call.  Register EVFILT_READ filters and
2514  * reject all others.
2515  */
2516 int bpfkqfilter(dev_t dev, struct knote *kn);
2517 static void filt_bpfdetach(struct knote *);
2518 static int filt_bpfread(struct knote *, long);
2519 static int filt_bpftouch(struct knote *kn, struct kevent_qos_s *kev);
2520 static int filt_bpfprocess(struct knote *kn, struct kevent_qos_s *kev);
2521
2522 SECURITY_READ_ONLY_EARLY(struct filterops) bpfread_filtops = {
2523         .f_isfd = 1,
2524         .f_detach = filt_bpfdetach,
2525         .f_event = filt_bpfread,
2526         .f_touch = filt_bpftouch,
2527         .f_process = filt_bpfprocess,
2528 };
2529
2530 static int
2531 filt_bpfread_common(struct knote *kn, struct kevent_qos_s *kev, struct bpf_d *d)
2532 {
2533         int ready = 0;
2534         int64_t data = 0;
2535
2536         if (d->bd_immediate) {
2537                 /*
2538                  * If there's data in the hold buffer, it's the
2539                  * amount of data a read will return.
2540                  *
2541                  * If there's no data in the hold buffer, but
2542                  * there's data in the store buffer, a read will
2543                  * immediately rotate the store buffer to the
2544                  * hold buffer, the amount of data in the store
2545                  * buffer is the amount of data a read will
2546                  * return.
2547                  *
2548                  * If there's no data in either buffer, we're not
2549                  * ready to read.
2550                  */
2551                 data = (d->bd_hlen == 0 || d->bd_hbuf_read != 0 ?
2552                     d->bd_slen : d->bd_hlen);
2553                 int64_t lowwat = knote_low_watermark(kn);
2554                 if (lowwat > d->bd_bufsize) {
2555                         lowwat = d->bd_bufsize;
2556                 }
2557                 ready = (data >= lowwat);
2558         } else {
2559                 /*
2560                  * If there's data in the hold buffer, it's the
2561                  * amount of data a read will return.
2562                  *
2563                  * If there's no data in the hold buffer, but
2564                  * there's data in the store buffer, if the
2565                  * timer has expired a read will immediately
2566                  * rotate the store buffer to the hold buffer,
2567                  * so the amount of data in the store buffer is
2568                  * the amount of data a read will return.
2569                  *
2570                  * If there's no data in either buffer, or there's
2571                  * no data in the hold buffer and the timer hasn't
2572                  * expired, we're not ready to read.
2573                  */
2574                 data = ((d->bd_hlen == 0 || d->bd_hbuf_read != 0) &&
2575                     d->bd_state == BPF_TIMED_OUT ? d->bd_slen : d->bd_hlen);
2576                 ready = (data > 0);
2577         }
2578         if (!ready) {
2579                 bpf_start_timer(d);
2580         } else if (kev) {
2581                 knote_fill_kevent(kn, kev, data);
2582         }
2583
2584         return ready;
2585 }
2586
2587 int
2588 bpfkqfilter(dev_t dev, struct knote *kn)
2589 {
2590         struct bpf_d *d;
2591         int res;
2592
2593         /*
2594          * Is this device a bpf?
2595          */
2596         if (major(dev) != CDEV_MAJOR || kn->kn_filter != EVFILT_READ) {
2597                 knote_set_error(kn, EINVAL);
2598                 return 0;
2599         }
2600
2601         lck_mtx_lock(bpf_mlock);
2602
2603         d = bpf_dtab[minor(dev)];
2604
2605         if (d == NULL || d == BPF_DEV_RESERVED ||
2606             (d->bd_flags & BPF_CLOSING) != 0 ||
2607             d->bd_bif == NULL) {
2608                 lck_mtx_unlock(bpf_mlock);
2609                 knote_set_error(kn, ENXIO);
2610                 return 0;
2611         }
2612
2613         kn->kn_hook = d;
2614         kn->kn_filtid = EVFILTID_BPFREAD;
2615         KNOTE_ATTACH(&d->bd_sel.si_note, kn);
2616         d->bd_flags |= BPF_KNOTE;
2617
2618         /* capture the current state */
2619         res = filt_bpfread_common(kn, NULL, d);
2620
2621         lck_mtx_unlock(bpf_mlock);
2622
2623         return res;
2624 }
2625
2626 static void
2627 filt_bpfdetach(struct knote *kn)
2628 {
2629         struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
2630
2631         lck_mtx_lock(bpf_mlock);
2632         if (d->bd_flags & BPF_KNOTE) {
2633                 KNOTE_DETACH(&d->bd_sel.si_note, kn);
2634                 d->bd_flags &= ~BPF_KNOTE;
2635         }
2636         lck_mtx_unlock(bpf_mlock);
2637 }
2638
2639 static int
2640 filt_bpfread(struct knote *kn, long hint)
2641 {
2642 #pragma unused(hint)
2643         struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
2644
2645         return filt_bpfread_common(kn, NULL, d);
2646 }
2647
2648 static int
2649 filt_bpftouch(struct knote *kn, struct kevent_qos_s *kev)
2650 {
2651         struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
2652         int res;
2653
2654         lck_mtx_lock(bpf_mlock);
2655
2656         /* save off the lowat threshold and flag */
2657         kn->kn_sdata = kev->data;
2658         kn->kn_sfflags = kev->fflags;
2659
2660         /* output data will be re-generated here */
2661         res = filt_bpfread_common(kn, NULL, d);
2662
2663         lck_mtx_unlock(bpf_mlock);
2664
2665         return res;
2666 }
2667
2668 static int
2669 filt_bpfprocess(struct knote *kn, struct kevent_qos_s *kev)
2670 {
2671         struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
2672         int res;
2673
2674         lck_mtx_lock(bpf_mlock);
2675         res = filt_bpfread_common(kn, kev, d);
2676         lck_mtx_unlock(bpf_mlock);
2677
2678         return res;
2679 }
2680
2681 /*
2682  * Copy data from an mbuf chain into a buffer.  This code is derived
2683  * from m_copydata in kern/uipc_mbuf.c.
2684  */
2685 static void
2686 bpf_mcopy(struct mbuf * m, void *dst_arg, size_t len)
2687 {
2688         u_int count;
2689         u_char *dst;
2690
2691         dst = dst_arg;
2692         while (len > 0) {
2693                 if (m == 0) {
2694                         panic("bpf_mcopy");
2695                 }
2696                 count = min(m->m_len, len);
2697                 bcopy(mbuf_data(m), dst, count);
2698                 m = m->m_next;
2699                 dst += count;
2700                 len -= count;
2701         }
2702 }
2703
2704 static inline void
2705 bpf_tap_imp(
2706         ifnet_t         ifp,
2707         u_int32_t       dlt,
2708         struct bpf_packet *bpf_pkt,
2709         int             outbound)
2710 {
2711         struct bpf_d    *d;
2712         u_int slen;
2713         struct bpf_if *bp;
2714
2715         /*
2716          * It's possible that we get here after the bpf descriptor has been
2717          * detached from the interface; in such a case we simply return.
2718          * Lock ordering is important since we can be called asynchronously
2719          * (from IOKit) to process an inbound packet; when that happens
2720          * we would have been holding its "gateLock" and will be acquiring
2721          * "bpf_mlock" upon entering this routine.  Due to that, we release
2722          * "bpf_mlock" prior to calling ifnet_set_promiscuous (which will
2723          * acquire "gateLock" in the IOKit), in order to avoid a deadlock
2724          * when a ifnet_set_promiscuous request simultaneously collides with
2725          * an inbound packet being passed into the tap callback.
2726          */
2727         lck_mtx_lock(bpf_mlock);
2728         if (ifp->if_bpf == NULL) {
2729                 lck_mtx_unlock(bpf_mlock);
2730                 return;
2731         }
2732         for (bp = ifp->if_bpf; bp != NULL; bp = bp->bif_next) {
2733                 if (bp->bif_ifp != ifp) {
2734                         /* wrong interface */
2735                         bp = NULL;
2736                         break;
2737                 }
2738                 if (dlt == 0 || bp->bif_dlt == dlt) {
2739                         /* tapping default DLT or DLT matches */
2740                         break;
2741                 }
2742         }
2743         if (bp == NULL) {
2744                 goto done;
2745         }
2746         for (d = bp->bif_dlist; d; d = d->bd_next) {
2747                 struct bpf_packet *bpf_pkt_saved = bpf_pkt;
2748                 struct bpf_packet bpf_pkt_tmp;
2749                 struct pktap_header_buffer bpfp_header_tmp;
2750
2751                 if (outbound && !d->bd_seesent) {
2752                         continue;
2753                 }
2754
2755                 ++d->bd_rcount;
2756                 slen = bpf_filter(d->bd_filter, (u_char *)bpf_pkt,
2757                     bpf_pkt->bpfp_total_length, 0);
2758                 if (bp->bif_ifp->if_type == IFT_PKTAP &&
2759                     bp->bif_dlt == DLT_PKTAP) {
2760                         /*
2761                          * Need to copy the bpf_pkt because the conversion
2762                          * to v2 pktap header modifies the content of the
2763                          * bpfp_header
2764                          */
2765                         if ((d->bd_flags & BPF_PKTHDRV2) &&
2766                             bpf_pkt->bpfp_header_length <= sizeof(bpfp_header_tmp)) {
2767                                 bpf_pkt_tmp = *bpf_pkt;
2768
2769                                 bpf_pkt = &bpf_pkt_tmp;
2770
2771                                 memcpy(&bpfp_header_tmp, bpf_pkt->bpfp_header,
2772                                     bpf_pkt->bpfp_header_length);
2773
2774                                 bpf_pkt->bpfp_header = &bpfp_header_tmp;
2775
2776                                 convert_to_pktap_header_to_v2(bpf_pkt,
2777                                     !!(d->bd_flags & BPF_TRUNCATE));
2778                         }
2779
2780                         if (d->bd_flags & BPF_TRUNCATE) {
2781                                 slen = min(slen,
2782                                     get_pkt_trunc_len((u_char *)bpf_pkt,
2783                                     bpf_pkt->bpfp_total_length));
2784                         }
2785                 }
2786                 if (slen != 0) {
2787                         catchpacket(d, bpf_pkt, slen, outbound);
2788                 }
2789                 bpf_pkt = bpf_pkt_saved;
2790         }
2791
2792 done:
2793         lck_mtx_unlock(bpf_mlock);
2794 }
2795
2796 static inline void
2797 bpf_tap_mbuf(
2798         ifnet_t         ifp,
2799         u_int32_t       dlt,
2800         mbuf_t          m,
2801         void*           hdr,
2802         size_t          hlen,
2803         int             outbound)
2804 {
2805         struct bpf_packet bpf_pkt;
2806         struct mbuf *m0;
2807
2808         if (ifp->if_bpf == NULL) {
2809                 /* quickly check without taking lock */
2810                 return;
2811         }
2812         bpf_pkt.bpfp_type = BPF_PACKET_TYPE_MBUF;
2813         bpf_pkt.bpfp_mbuf = m;
2814         bpf_pkt.bpfp_total_length = 0;
2815         for (m0 = m; m0 != NULL; m0 = m0->m_next) {
2816                 bpf_pkt.bpfp_total_length += m0->m_len;
2817         }
2818         bpf_pkt.bpfp_header = hdr;
2819         if (hdr != NULL) {
2820                 bpf_pkt.bpfp_total_length += hlen;
2821                 bpf_pkt.bpfp_header_length = hlen;
2822         } else {
2823                 bpf_pkt.bpfp_header_length = 0;
2824         }
2825         bpf_tap_imp(ifp, dlt, &bpf_pkt, outbound);
2826 }
2827
2828 void
2829 bpf_tap_out(
2830         ifnet_t         ifp,
2831         u_int32_t       dlt,
2832         mbuf_t          m,
2833         void*           hdr,
2834         size_t          hlen)
2835 {
2836         bpf_tap_mbuf(ifp, dlt, m, hdr, hlen, 1);
2837 }
2838
2839 void
2840 bpf_tap_in(
2841         ifnet_t         ifp,
2842         u_int32_t       dlt,
2843         mbuf_t          m,
2844         void*           hdr,
2845         size_t          hlen)
2846 {
2847         bpf_tap_mbuf(ifp, dlt, m, hdr, hlen, 0);
2848 }
2849
/*
 * Callback registered with Ethernet driver.
 *
 * Taps the default DLT with no extra header.  A packet whose receive
 * interface is NULL is reported as outbound.  Always returns 0.
 */
static int
bpf_tap_callback(struct ifnet *ifp, struct mbuf *m)
{
        int outbound = (mbuf_pkthdr_rcvif(m) == NULL);

        bpf_tap_mbuf(ifp, 0, m, NULL, 0, outbound);

        return 0;
}
2858
2859
2860 static errno_t
2861 bpf_copydata(struct bpf_packet *pkt, size_t off, size_t len, void* out_data)
2862 {
2863         errno_t err = 0;
2864         if (pkt->bpfp_type == BPF_PACKET_TYPE_MBUF) {
2865                 err = mbuf_copydata(pkt->bpfp_mbuf, off, len, out_data);
2866         } else {
2867                 err = EINVAL;
2868         }
2869
2870         return err;
2871 }
2872
2873 static void
2874 copy_bpf_packet(struct bpf_packet * pkt, void * dst, size_t len)
2875 {
2876         /* copy the optional header */
2877         if (pkt->bpfp_header_length != 0) {
2878                 size_t  count = min(len, pkt->bpfp_header_length);
2879                 bcopy(pkt->bpfp_header, dst, count);
2880                 len -= count;
2881                 dst += count;
2882         }
2883         if (len == 0) {
2884                 /* nothing past the header */
2885                 return;
2886         }
2887         /* copy the packet */
2888         switch (pkt->bpfp_type) {
2889         case BPF_PACKET_TYPE_MBUF:
2890                 bpf_mcopy(pkt->bpfp_mbuf, dst, len);
2891                 break;
2892         default:
2893                 break;
2894         }
2895 }
2896
2897 static uint16_t
2898 get_esp_trunc_len(__unused struct bpf_packet *pkt, __unused uint16_t off,
2899     const uint16_t remaining_caplen)
2900 {
2901         /*
2902          * For some reason tcpdump expects to have one byte beyond the ESP header
2903          */
2904         uint16_t trunc_len = ESP_HDR_SIZE + 1;
2905
2906         if (trunc_len > remaining_caplen) {
2907                 return remaining_caplen;
2908         }
2909
2910         return trunc_len;
2911 }
2912
2913 static uint16_t
2914 get_isakmp_trunc_len(__unused struct bpf_packet *pkt, __unused uint16_t off,
2915     const uint16_t remaining_caplen)
2916 {
2917         /*
2918          * Include the payload generic header
2919          */
2920         uint16_t trunc_len = ISAKMP_HDR_SIZE;
2921
2922         if (trunc_len > remaining_caplen) {
2923                 return remaining_caplen;
2924         }
2925
2926         return trunc_len;
2927 }
2928
/*
 * Number of bytes to keep of a UDP payload exchanged on the ISAKMP NAT-T
 * port, never more than `remaining_caplen`.
 *
 * There are three cases:
 * - IKE: payload starts with a 4 byte header set to zero before the
 *   ISAKMP header
 * - keep alive: 1 byte payload
 * - otherwise it's ESP
 *
 * Only the first four payload bytes are needed to tell the cases apart,
 * so only those are copied into a fixed-size buffer.  This avoids placing
 * a variable length array of up to `remaining_caplen` bytes on the stack.
 * If those bytes cannot be copied, `remaining_caplen` is returned, i.e.
 * the payload is not truncated.
 */
static uint16_t
get_isakmp_natt_trunc_len(struct bpf_packet *pkt, uint16_t off,
    const uint16_t remaining_caplen)
{
        char marker[4] = { 0 };
        uint16_t trunc_len = 0;
        int is_ike = 0;

        /* The zero marker can only be present if four bytes are captured. */
        if (remaining_caplen >= sizeof(marker)) {
                if (bpf_copydata(pkt, off, sizeof(marker), marker) != 0) {
                        return remaining_caplen;
                }
                is_ike = (marker[0] == 0 && marker[1] == 0 &&
                    marker[2] == 0 && marker[3] == 0);
        }

        if (is_ike) {
                trunc_len = 4 + get_isakmp_trunc_len(pkt, off + 4, remaining_caplen - 4);
        } else if (remaining_caplen == 1) {
                trunc_len = 1;
        } else {
                trunc_len = get_esp_trunc_len(pkt, off, remaining_caplen);
        }

        if (trunc_len > remaining_caplen) {
                return remaining_caplen;
        }

        return trunc_len;
}
2963
2964 static uint16_t
2965 get_udp_trunc_len(struct bpf_packet *pkt, uint16_t off, const uint16_t remaining_caplen)
2966 {
2967         int err = 0;
2968         uint16_t trunc_len = sizeof(struct udphdr); /* By default no UDP payload */
2969
2970         if (trunc_len >= remaining_caplen) {
2971                 return remaining_caplen;
2972         }
2973
2974         struct udphdr udphdr;
2975         err = bpf_copydata(pkt, off, sizeof(struct udphdr), &udphdr);
2976         if (err != 0) {
2977                 return remaining_caplen;
2978         }
2979
2980         u_short sport, dport;
2981
2982         sport = EXTRACT_SHORT(&udphdr.uh_sport);
2983         dport = EXTRACT_SHORT(&udphdr.uh_dport);
2984
2985         if (dport == PORT_DNS || sport == PORT_DNS) {
2986                 /*
2987                  * Full UDP payload for DNS
2988                  */
2989                 trunc_len = remaining_caplen;
2990         } else if ((sport == PORT_BOOTPS && dport == PORT_BOOTPC) ||
2991             (sport == PORT_BOOTPC && dport == PORT_BOOTPS)) {
2992                 /*
2993                  * Full UDP payload for BOOTP and DHCP
2994                  */
2995                 trunc_len = remaining_caplen;
2996         } else if (dport == PORT_ISAKMP && sport == PORT_ISAKMP) {
2997                 /*
2998                  * Return the ISAKMP header
2999                  */
3000                 trunc_len += get_isakmp_trunc_len(pkt, off + sizeof(struct udphdr),
3001                     remaining_caplen - sizeof(struct udphdr));
3002         } else if (dport == PORT_ISAKMP_NATT && sport == PORT_ISAKMP_NATT) {
3003                 trunc_len += get_isakmp_natt_trunc_len(pkt, off + sizeof(struct udphdr),
3004                     remaining_caplen - sizeof(struct udphdr));
3005         }
3006         if (trunc_len >= remaining_caplen) {
3007                 return remaining_caplen;
3008         }
3009
3010         return trunc_len;
3011 }
3012
3013 static uint16_t
3014 get_tcp_trunc_len(struct bpf_packet *pkt, uint16_t off, const uint16_t remaining_caplen)
3015 {
3016         int err = 0;
3017         uint16_t trunc_len = sizeof(struct tcphdr); /* By default no TCP payload */
3018         if (trunc_len >= remaining_caplen) {
3019                 return remaining_caplen;
3020         }
3021
3022         struct tcphdr tcphdr;
3023         err = bpf_copydata(pkt, off, sizeof(struct tcphdr), &tcphdr);
3024         if (err != 0) {
3025                 return remaining_caplen;
3026         }
3027
3028         u_short sport, dport;
3029         sport = EXTRACT_SHORT(&tcphdr.th_sport);
3030         dport = EXTRACT_SHORT(&tcphdr.th_dport);
3031
3032         if (dport == PORT_DNS || sport == PORT_DNS) {
3033                 /*
3034                  * Full TCP payload  for DNS
3035                  */
3036                 trunc_len = remaining_caplen;
3037         } else {
3038                 trunc_len = tcphdr.th_off << 2;
3039         }
3040         if (trunc_len >= remaining_caplen) {
3041                 return remaining_caplen;
3042         }
3043
3044         return trunc_len;
3045 }
3046
3047 static uint16_t
3048 get_proto_trunc_len(uint8_t proto, struct bpf_packet *pkt, uint16_t off, const uint16_t remaining_caplen)
3049 {
3050         uint16_t trunc_len;
3051
3052         switch (proto) {
3053         case IPPROTO_ICMP: {
3054                 /*
3055                  * Full IMCP payload
3056                  */
3057                 trunc_len = remaining_caplen;
3058                 break;
3059         }
3060         case IPPROTO_ICMPV6: {
3061                 /*
3062                  * Full IMCPV6 payload
3063                  */
3064                 trunc_len = remaining_caplen;
3065                 break;
3066         }
3067         case IPPROTO_IGMP: {
3068                 /*
3069                  * Full IGMP payload
3070                  */
3071                 trunc_len = remaining_caplen;
3072                 break;
3073         }
3074         case IPPROTO_UDP: {
3075                 trunc_len = get_udp_trunc_len(pkt, off, remaining_caplen);
3076                 break;
3077         }
3078         case IPPROTO_TCP: {
3079                 trunc_len = get_tcp_trunc_len(pkt, off, remaining_caplen);
3080                 break;
3081         }
3082         case IPPROTO_ESP: {
3083                 trunc_len = get_esp_trunc_len(pkt, off, remaining_caplen);
3084                 break;
3085         }
3086         default: {
3087                 /*
3088                  * By default we only include the IP header
3089                  */
3090                 trunc_len = 0;
3091                 break;
3092         }
3093         }
3094         if (trunc_len >= remaining_caplen) {
3095                 return remaining_caplen;
3096         }
3097
3098         return trunc_len;
3099 }
3100
3101 static uint16_t
3102 get_ip_trunc_len(struct bpf_packet *pkt, uint16_t off, const uint16_t remaining_caplen)
3103 {
3104         int err = 0;
3105         uint16_t iplen = sizeof(struct ip);
3106         if (iplen >= remaining_caplen) {
3107                 return remaining_caplen;
3108         }
3109
3110         struct ip iphdr;
3111         err =  bpf_copydata(pkt, off, sizeof(struct ip), &iphdr);
3112         if (err != 0) {
3113                 return remaining_caplen;
3114         }
3115
3116         uint8_t proto = 0;
3117
3118         iplen = iphdr.ip_hl << 2;
3119         if (iplen >= remaining_caplen) {
3120                 return remaining_caplen;
3121         }
3122
3123         proto = iphdr.ip_p;
3124         iplen += get_proto_trunc_len(proto, pkt, off + iplen, remaining_caplen - iplen);
3125
3126         if (iplen >= remaining_caplen) {
3127                 return remaining_caplen;
3128         }
3129
3130         return iplen;
3131 }
3132
3133 static uint16_t
3134 get_ip6_trunc_len(struct bpf_packet *pkt, uint16_t off, const uint16_t remaining_caplen)
3135 {
3136         int err = 0;
3137         uint16_t iplen = sizeof(struct ip6_hdr);
3138         if (iplen >= remaining_caplen) {
3139                 return remaining_caplen;
3140         }
3141
3142         struct ip6_hdr ip6hdr;
3143         err = bpf_copydata(pkt, off, sizeof(struct ip6_hdr), &ip6hdr);
3144         if (err != 0) {
3145                 return remaining_caplen;
3146         }
3147
3148         uint8_t proto = 0;
3149
3150         /*
3151          * TBD: process the extension headers
3152          */
3153         proto = ip6hdr.ip6_nxt;
3154         iplen += get_proto_trunc_len(proto, pkt, off + iplen, remaining_caplen - iplen);
3155
3156         if (iplen >= remaining_caplen) {
3157                 return remaining_caplen;
3158         }
3159
3160         return iplen;
3161 }
3162
3163 static uint16_t
3164 get_ether_trunc_len(struct bpf_packet *pkt, int off, const uint16_t remaining_caplen)
3165 {
3166         int err = 0;
3167         uint16_t ethlen = sizeof(struct ether_header);
3168         if (ethlen >= remaining_caplen) {
3169                 return remaining_caplen;
3170         }
3171
3172         struct ether_header eh;
3173         u_short type;
3174         err = bpf_copydata(pkt, off, sizeof(struct ether_header), &eh);
3175         if (err != 0) {
3176                 return remaining_caplen;
3177         }
3178
3179         type = EXTRACT_SHORT(&eh.ether_type);
3180         /* Include full ARP */
3181         if (type == ETHERTYPE_ARP) {
3182                 ethlen = remaining_caplen;
3183         } else if (type != ETHERTYPE_IP && type != ETHERTYPE_IPV6) {
3184                 ethlen = min(BPF_MIN_PKT_SIZE, remaining_caplen);
3185         } else {
3186                 if (type == ETHERTYPE_IP) {
3187                         ethlen += get_ip_trunc_len(pkt, sizeof(struct ether_header),
3188                             remaining_caplen);
3189                 } else if (type == ETHERTYPE_IPV6) {
3190                         ethlen += get_ip6_trunc_len(pkt, sizeof(struct ether_header),
3191                             remaining_caplen);
3192                 }
3193         }
3194         return ethlen;
3195 }
3196
3197 static uint32_t
3198 get_pkt_trunc_len(u_char *p, u_int len)
3199 {
3200         struct bpf_packet *pkt = (struct bpf_packet *)(void *) p;
3201         struct pktap_header *pktap = (struct pktap_header *) (pkt->bpfp_header);
3202         uint32_t out_pkt_len = 0, tlen = 0;
3203         /*
3204          * pktap->pth_frame_pre_length is L2 header length and accounts
3205          * for both pre and pre_adjust.
3206          * pktap->pth_length is sizeof(pktap_header) (excl the pre/pre_adjust)
3207          * pkt->bpfp_header_length is (pktap->pth_length + pre_adjust)
3208          * pre is the offset to the L3 header after the bpfp_header, or length
3209          * of L2 header after bpfp_header, if present.
3210          */
3211         int32_t pre = pktap->pth_frame_pre_length -
3212             (pkt->bpfp_header_length - pktap->pth_length);
3213
3214         /* Length of the input packet starting from  L3 header */
3215         uint32_t in_pkt_len = len - pkt->bpfp_header_length - pre;
3216         if (pktap->pth_protocol_family == AF_INET ||
3217             pktap->pth_protocol_family == AF_INET6) {
3218                 /* Contains L2 header */
3219                 if (pre > 0) {
3220                         if (pre < (int32_t)sizeof(struct ether_header)) {
3221                                 goto too_short;
3222                         }
3223
3224                         out_pkt_len = get_ether_trunc_len(pkt, 0, in_pkt_len);
3225                 } else if (pre == 0) {
3226                         if (pktap->pth_protocol_family == AF_INET) {
3227                                 out_pkt_len = get_ip_trunc_len(pkt, pre, in_pkt_len);
3228                         } else if (pktap->pth_protocol_family == AF_INET6) {
3229                                 out_pkt_len = get_ip6_trunc_len(pkt, pre, in_pkt_len);
3230                         }
3231                 } else {
3232                         /* Ideally pre should be >= 0. This is an exception */
3233                         out_pkt_len = min(BPF_MIN_PKT_SIZE, in_pkt_len);
3234                 }
3235         } else {
3236                 if (pktap->pth_iftype == IFT_ETHER) {
3237                         if (in_pkt_len < sizeof(struct ether_header)) {
3238                                 goto too_short;
3239                         }
3240                         /* At most include the Ethernet header and 16 bytes */
3241                         out_pkt_len = MIN(sizeof(struct ether_header) + 16,
3242                             in_pkt_len);
3243                 } else {
3244                         /*
3245                          * For unknown protocols include at most 16 bytes
3246                          */
3247                         out_pkt_len = MIN(16, in_pkt_len);
3248                 }
3249         }
3250 done:
3251         tlen = pkt->bpfp_header_length + out_pkt_len + pre;
3252         return tlen;
3253 too_short:
3254         out_pkt_len = in_pkt_len;
3255         goto done;
3256 }
3257
/*
 * Move the packet data from interface memory (pkt) into the
 * store buffer, prefixed by a bpf header (regular or extended,
 * depending on BPF_EXTENDED_HDR).
 *
 * d        - descriptor whose store buffer receives the packet
 * pkt      - packet to record
 * snaplen  - maximum number of packet bytes to record
 * outbound - non-zero if the packet was sent rather than received; only
 *            used to set the direction flag of the extended header
 *
 * Nothing is returned.  When the store buffer had to be rotated to make
 * room, or when immediate mode is set or the read timeout has already
 * expired, the reader is woken up through bpf_wakeup() after the packet
 * has been stored.  When there is no room and the buffers cannot be
 * rotated, the packet is dropped and bd_dcount is incremented.
 */
static void
catchpacket(struct bpf_d *d, struct bpf_packet * pkt,
    u_int snaplen, int outbound)
{
        struct bpf_hdr *hp;
        struct bpf_hdr_ext *ehp;
        int totlen, curlen;
        int hdrlen, caplen;
        int do_wakeup = 0;
        u_char *payload;
        struct timeval tv;

        /* The bpf header length depends on the header format in use. */
        hdrlen = (d->bd_flags & BPF_EXTENDED_HDR) ? d->bd_bif->bif_exthdrlen :
            d->bd_bif->bif_hdrlen;
        /*
         * Figure out how many bytes to move.  If the packet is
         * greater or equal to the snapshot length, transfer that
         * much.  Otherwise, transfer the whole packet (unless
         * we hit the buffer size limit).
         */
        totlen = hdrlen + min(snaplen, pkt->bpfp_total_length);
        if (totlen > d->bd_bufsize) {
                totlen = d->bd_bufsize;
        }

        /*
         * The buffer cannot even hold the bpf header: give up silently
         * (this path does not count as a drop).
         */
        if (hdrlen > totlen) {
                return;
        }

        /*
         * Round up the end of the previous packet to the next longword.
         */
        curlen = BPF_WORDALIGN(d->bd_slen);
        if (curlen + totlen > d->bd_bufsize) {
                /*
                 * This packet will overflow the storage buffer.
                 * Rotate the buffers if we can, then wakeup any
                 * pending reads.
                 *
                 * We cannot rotate buffers if a read is in progress
                 * so drop the packet
                 */
                if (d->bd_hbuf_read != 0) {
                        ++d->bd_dcount;
                        return;
                }

                if (d->bd_fbuf == NULL) {
                        if (d->bd_headdrop == 0) {
                                /*
                                 * We haven't completed the previous read yet,
                                 * so drop the packet.
                                 */
                                ++d->bd_dcount;
                                return;
                        }
                        /*
                         * Drop the hold buffer as it contains older packets.
                         * Its packets are accounted as drops and it is
                         * recycled as the free buffer before rotating.
                         */
                        d->bd_dcount += d->bd_hcnt;
                        d->bd_fbuf = d->bd_hbuf;
                        ROTATE_BUFFERS(d);
                } else {
                        ROTATE_BUFFERS(d);
                }
                do_wakeup = 1;
                /* The packet goes at the start of the new store buffer. */
                curlen = 0;
        } else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) {
                /*
                 * Immediate mode is set, or the read timeout has
                 * already expired during a select call. A packet
                 * arrived, so the reader should be woken up.
                 */
                do_wakeup = 1;
        }

        /*
         * Append the bpf header.
         */
        microtime(&tv);
        if (d->bd_flags & BPF_EXTENDED_HDR) {
                struct mbuf *m;

                /* Per-packet metadata is only available from an mbuf. */
                m = (pkt->bpfp_type == BPF_PACKET_TYPE_MBUF)
                    ? pkt->bpfp_mbuf : NULL;
                ehp = (struct bpf_hdr_ext *)(void *)(d->bd_sbuf + curlen);
                memset(ehp, 0, sizeof(*ehp));
                ehp->bh_tstamp.tv_sec = tv.tv_sec;
                ehp->bh_tstamp.tv_usec = tv.tv_usec;

                ehp->bh_datalen = pkt->bpfp_total_length;
                ehp->bh_hdrlen = hdrlen;
                caplen = ehp->bh_caplen = totlen - hdrlen;
                if (m == NULL) {
                        /* No mbuf: only the direction can be reported. */
                        if (outbound) {
                                ehp->bh_flags |= BPF_HDR_EXT_FLAGS_DIR_OUT;
                        } else {
                                ehp->bh_flags |= BPF_HDR_EXT_FLAGS_DIR_IN;
                        }
                } else if (outbound) {
                        ehp->bh_flags |= BPF_HDR_EXT_FLAGS_DIR_OUT;

                        /* only do lookups on non-raw INPCB */
                        if ((m->m_pkthdr.pkt_flags & (PKTF_FLOW_ID |
                            PKTF_FLOW_LOCALSRC | PKTF_FLOW_RAWSOCK)) ==
                            (PKTF_FLOW_ID | PKTF_FLOW_LOCALSRC) &&
                            m->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB) {
                                ehp->bh_flowid = m->m_pkthdr.pkt_flowid;
                                ehp->bh_proto = m->m_pkthdr.pkt_proto;
                        }
                        ehp->bh_svc = so_svc2tc(m->m_pkthdr.pkt_svc);
                        /* Mirror selected mbuf packet flags into the header. */
                        if (m->m_pkthdr.pkt_flags & PKTF_TCP_REXMT) {
                                ehp->bh_pktflags |= BPF_PKTFLAGS_TCP_REXMT;
                        }
                        if (m->m_pkthdr.pkt_flags & PKTF_START_SEQ) {
                                ehp->bh_pktflags |= BPF_PKTFLAGS_START_SEQ;
                        }
                        if (m->m_pkthdr.pkt_flags & PKTF_LAST_PKT) {
                                ehp->bh_pktflags |= BPF_PKTFLAGS_LAST_PKT;
                        }
                        if (m->m_pkthdr.pkt_flags & PKTF_VALID_UNSENT_DATA) {
                                ehp->bh_unsent_bytes =
                                    m->m_pkthdr.bufstatus_if;
                                ehp->bh_unsent_snd =
                                    m->m_pkthdr.bufstatus_sndbuf;
                        }
                } else {
                        ehp->bh_flags |= BPF_HDR_EXT_FLAGS_DIR_IN;
                }
                payload = (u_char *)ehp + hdrlen;
        } else {
                /*
                 * Regular header: only the fields below are written, the
                 * structure is not cleared first.
                 */
                hp = (struct bpf_hdr *)(void *)(d->bd_sbuf + curlen);
                hp->bh_tstamp.tv_sec = tv.tv_sec;
                hp->bh_tstamp.tv_usec = tv.tv_usec;
                hp->bh_datalen = pkt->bpfp_total_length;
                hp->bh_hdrlen = hdrlen;
                caplen = hp->bh_caplen = totlen - hdrlen;
                payload = (u_char *)hp + hdrlen;
        }
        /*
         * Copy the packet data into the store buffer and update its length.
         */
        copy_bpf_packet(pkt, payload, caplen);
        d->bd_slen = curlen + totlen;
        d->bd_scnt += 1;

        /* Wake the reader only once the packet is fully in place. */
        if (do_wakeup) {
                bpf_wakeup(d);
        }
}
3413
3414 /*
3415  * Initialize all nonzero fields of a descriptor.
3416  */
3417 static int
3418 bpf_allocbufs(struct bpf_d *d)
3419 {
3420         if (d->bd_sbuf != NULL) {
3421                 FREE(d->bd_sbuf, M_DEVBUF);
3422                 d->bd_sbuf = NULL;
3423         }
3424         if (d->bd_hbuf != NULL) {
3425                 FREE(d->bd_hbuf, M_DEVBUF);
3426                 d->bd_hbuf = NULL;
3427         }
3428         if (d->bd_fbuf != NULL) {
3429                 FREE(d->bd_fbuf, M_DEVBUF);
3430                 d->bd_fbuf = NULL;
3431         }
3432
3433         d->bd_fbuf = (caddr_t) _MALLOC(d->bd_bufsize, M_DEVBUF, M_WAIT);
3434         if (d->bd_fbuf == NULL) {
3435                 return ENOBUFS;
3436         }
3437
3438         d->bd_sbuf = (caddr_t) _MALLOC(d->bd_bufsize, M_DEVBUF, M_WAIT);
3439         if (d->bd_sbuf == NULL) {
3440                 FREE(d->bd_fbuf, M_DEVBUF);
3441                 d->bd_fbuf = NULL;
3442                 return ENOBUFS;
3443         }
3444         d->bd_slen = 0;
3445         d->bd_hlen = 0;
3446         d->bd_scnt = 0;
3447         d->bd_hcnt = 0;
3448         return 0;
3449 }
3450
3451 /*
3452  * Free buffers currently in use by a descriptor.
3453  * Called on close.
3454  */
3455 static void
3456 bpf_freed(struct bpf_d *d)
3457 {
3458         /*
3459          * We don't need to lock out interrupts since this descriptor has
3460          * been detached from its interface and it yet hasn't been marked
3461          * free.
3462          */
3463         if (d->bd_hbuf_read != 0) {
3464                 panic("bpf buffer freed during read");
3465         }
3466
3467         if (d->bd_sbuf != 0) {
3468                 FREE(d->bd_sbuf, M_DEVBUF);
3469                 if (d->bd_hbuf != 0) {
3470                         FREE(d->bd_hbuf, M_DEVBUF);
3471                 }
3472                 if (d->bd_fbuf != 0) {
3473                         FREE(d->bd_fbuf, M_DEVBUF);
3474                 }
3475         }
3476         if (d->bd_filter) {
3477                 FREE(d->bd_filter, M_DEVBUF);
3478         }
3479 }
3480
3481 /*
3482  * Attach an interface to bpf.  driverp is a pointer to a (struct bpf_if *)
3483  * in the driver's softc; dlt is the link layer type; hdrlen is the fixed
3484  * size of the link header (variable length headers not yet supported).
3485  */
3486 void
3487 bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen)
3488 {
3489         bpf_attach(ifp, dlt, hdrlen, NULL, NULL);
3490 }
3491
3492 errno_t
3493 bpf_attach(
3494         ifnet_t                 ifp,
3495         u_int32_t               dlt,
3496         u_int32_t               hdrlen,
3497         bpf_send_func   send,
3498         bpf_tap_func    tap)
3499 {
3500         struct bpf_if *bp;
3501         struct bpf_if *bp_new;
3502         struct bpf_if *bp_before_first = NULL;
3503         struct bpf_if *bp_first = NULL;
3504         struct bpf_if *bp_last = NULL;
3505         boolean_t found;
3506
3507         bp_new = (struct bpf_if *) _MALLOC(sizeof(*bp_new), M_DEVBUF,
3508             M_WAIT | M_ZERO);
3509         if (bp_new == 0) {
3510                 panic("bpfattach");
3511         }
3512
3513         lck_mtx_lock(bpf_mlock);
3514
3515         /*
3516          * Check if this interface/dlt is already attached. Remember the
3517          * first and last attachment for this interface, as well as the
3518          * element before the first attachment.
3519          */
3520         found = FALSE;
3521         for (bp = bpf_iflist; bp != NULL; bp = bp->bif_next) {
3522                 if (bp->bif_ifp != ifp) {
3523                         if (bp_first != NULL) {
3524                                 /* no more elements for this interface */
3525                                 break;
3526                         }
3527                         bp_before_first = bp;
3528                 } else {
3529                         if (bp->bif_dlt == dlt) {
3530                                 found = TRUE;
3531                                 break;
3532                         }
3533                         if (bp_first == NULL) {
3534                                 bp_first = bp;
3535                         }
3536                         bp_last = bp;
3537                 }
3538         }
3539         if (found) {
3540                 lck_mtx_unlock(bpf_mlock);
3541                 printf("bpfattach - %s with dlt %d is already attached\n",
3542                     if_name(ifp), dlt);
3543                 FREE(bp_new, M_DEVBUF);
3544                 return EEXIST;
3545         }
3546
3547         bp_new->bif_ifp = ifp;
3548         bp_new->bif_dlt = dlt;
3549         bp_new->bif_send = send;
3550         bp_new->bif_tap = tap;
3551
3552         if (bp_first == NULL) {
3553                 /* No other entries for this ifp */
3554                 bp_new->bif_next = bpf_iflist;
3555                 bpf_iflist = bp_new;
3556         } else {
3557                 if (ifnet_type(ifp) == IFT_ETHER && dlt == DLT_EN10MB) {
3558                         /* Make this the first entry for this interface */
3559                         if (bp_before_first != NULL) {
3560                                 /*  point the previous to us */
3561                                 bp_before_first->bif_next = bp_new;
3562                         } else {
3563                                 /* we're the new head */
3564                                 bpf_iflist = bp_new;
3565                         }
3566                         bp_new->bif_next = bp_first;
3567                 } else {
3568                         /* Add this after the last entry for this interface */
3569                         bp_new->bif_next = bp_last->bif_next;
3570                         bp_last->bif_next = bp_new;
3571                 }
3572         }
3573
3574         /*
3575          * Compute the length of the bpf header.  This is not necessarily
3576          * equal to SIZEOF_BPF_HDR because we want to insert spacing such
3577          * that the network layer header begins on a longword boundary (for
3578          * performance reasons and to alleviate alignment restrictions).
3579          */
3580         bp_new->bif_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen;
3581         bp_new->bif_exthdrlen = BPF_WORDALIGN(hdrlen +
3582             sizeof(struct bpf_hdr_ext)) - hdrlen;
3583
3584         /* Take a reference on the interface */
3585         ifnet_reference(ifp);
3586
3587         lck_mtx_unlock(bpf_mlock);
3588
3589 #ifndef __APPLE__
3590         if (bootverbose) {
3591                 printf("bpf: %s attached\n", if_name(ifp));
3592         }
3593 #endif
3594
3595         return 0;
3596 }
3597
3598 /*
3599  * Detach bpf from an interface.  This involves detaching each descriptor
3600  * associated with the interface, and leaving bd_bif NULL.  Notify each
3601  * descriptor as it's detached so that any sleepers wake up and get
3602  * ENXIO.
3603  */
3604 void
3605 bpfdetach(struct ifnet *ifp)
3606 {
3607         struct bpf_if   *bp, *bp_prev, *bp_next;
3608         struct bpf_d    *d;
3609
3610         if (bpf_debug != 0) {
3611                 printf("%s: %s\n", __func__, if_name(ifp));
3612         }
3613
3614         lck_mtx_lock(bpf_mlock);
3615
3616         /*
3617          * Build the list of devices attached to that interface
3618          * that we need to free while keeping the lock to maintain
3619          * the integrity of the interface list
3620          */
3621         bp_prev = NULL;
3622         for (bp = bpf_iflist; bp != NULL; bp = bp_next) {
3623                 bp_next = bp->bif_next;
3624
3625                 if (ifp != bp->bif_ifp) {
3626                         bp_prev = bp;
3627                         continue;
3628                 }
3629                 /* Unlink from the interface list */
3630                 if (bp_prev) {
3631                         bp_prev->bif_next = bp->bif_next;
3632                 } else {
3633                         bpf_iflist = bp->bif_next;
3634                 }
3635
3636                 /* Detach the devices attached to the interface */
3637                 while ((d = bp->bif_dlist) != NULL) {
3638                         /*
3639                          * Take an extra reference to prevent the device
3640                          * from being freed when bpf_detachd() releases
3641                          * the reference for the interface list
3642                          */
3643                         bpf_acquire_d(d);
3644                         bpf_detachd(d, 0);
3645                         bpf_wakeup(d);
3646                         bpf_release_d(d);
3647                 }
3648                 ifnet_release(ifp);
3649         }
3650
3651         lck_mtx_unlock(bpf_mlock);
3652 }
3653
3654 void
3655 bpf_init(__unused void *unused)
3656 {
3657 #ifdef __APPLE__
3658         int     i;
3659         int     maj;
3660
3661         if (bpf_devsw_installed == 0) {
3662                 bpf_devsw_installed = 1;
3663                 bpf_mlock_grp_attr = lck_grp_attr_alloc_init();
3664                 bpf_mlock_grp = lck_grp_alloc_init("bpf", bpf_mlock_grp_attr);
3665                 bpf_mlock_attr = lck_attr_alloc_init();
3666                 lck_mtx_init(bpf_mlock, bpf_mlock_grp, bpf_mlock_attr);
3667                 maj = cdevsw_add(CDEV_MAJOR, &bpf_cdevsw);
3668                 if (maj == -1) {
3669                         if (bpf_mlock_attr) {
3670                                 lck_attr_free(bpf_mlock_attr);
3671                         }
3672                         if (bpf_mlock_grp) {
3673                                 lck_grp_free(bpf_mlock_grp);
3674                         }
3675                         if (bpf_mlock_grp_attr) {
3676                                 lck_grp_attr_free(bpf_mlock_grp_attr);
3677                         }
3678
3679                         bpf_mlock = NULL;
3680                         bpf_mlock_attr = NULL;
3681                         bpf_mlock_grp = NULL;
3682                         bpf_mlock_grp_attr = NULL;
3683                         bpf_devsw_installed = 0;
3684                         printf("bpf_init: failed to allocate a major number\n");
3685                         return;
3686                 }
3687
3688                 for (i = 0; i < NBPFILTER; i++) {
3689                         bpf_make_dev_t(maj);
3690                 }
3691         }
3692 #else
3693         cdevsw_add(&bpf_cdevsw);
3694 #endif
3695 }
3696
3697 #ifndef __APPLE__
3698 SYSINIT(bpfdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE + CDEV_MAJOR, bpf_drvinit, NULL);
3699 #endif
3700
3701 static int
3702 sysctl_bpf_maxbufsize SYSCTL_HANDLER_ARGS
3703 {
3704 #pragma unused(arg1, arg2)
3705         int i, err;
3706
3707         i = bpf_maxbufsize;
3708
3709         err = sysctl_handle_int(oidp, &i, 0, req);
3710         if (err != 0 || req->newptr == USER_ADDR_NULL) {
3711                 return err;
3712         }
3713
3714         if (i < 0 || i > BPF_MAXSIZE_CAP) {
3715                 i = BPF_MAXSIZE_CAP;
3716         }
3717
3718         bpf_maxbufsize = i;
3719         return err;
3720 }