]> git.saurik.com Git - apple/xnu.git/blobdiff - bsd/net/dlil.c
xnu-2050.7.9.tar.gz
[apple/xnu.git] / bsd / net / dlil.c
index 9cca104a1b7aa9f06fbd453a599b894f337f4ff2..5da68ac6881f3ed9cbc2c6630ab0f987ad9bba52 100644 (file)
 #include <sys/domain.h>
 #include <sys/user.h>
 #include <sys/random.h>
+#include <sys/socketvar.h>
 #include <net/if_dl.h>
 #include <net/if.h>
 #include <net/route.h>
 #include <net/if_var.h>
 #include <net/dlil.h>
 #include <net/if_arp.h>
+#include <net/iptap.h>
 #include <sys/kern_event.h>
 #include <sys/kdebug.h>
 #include <sys/mcache.h>
 #include <net/if_types.h>
 #include <net/if_llreach.h>
 #include <net/kpi_interfacefilter.h>
+#include <net/classq/classq.h>
+#include <net/classq/classq_sfb.h>
 
 #if INET
 #include <netinet/in_var.h>
 #include <netinet/igmp_var.h>
+#include <netinet/ip_var.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_var.h>
+#include <netinet/udp.h>
+#include <netinet/udp_var.h>
+#include <netinet/if_ether.h>
+#include <netinet/in_pcb.h>
 #endif /* INET */
 
 #if INET6
 #if PF
 #include <net/pfvar.h>
 #endif /* PF */
+#if PF_ALTQ
+#include <net/altq/altq.h>
+#endif /* PF_ALTQ */
+#include <net/pktsched/pktsched.h>
 
 #define DBG_LAYER_BEG          DLILDBG_CODE(DBG_DLIL_STATIC, 0)
 #define DBG_LAYER_END          DLILDBG_CODE(DBG_DLIL_STATIC, 2)
 #define DLIL_PRINTF    kprintf
 #endif
 
-#define        _CASSERT(x)     \
-       switch (0) { case 0: case (x): ; }
-
 #define        IF_DATA_REQUIRE_ALIGNED_64(f)   \
        _CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int64_t)))
 
 #define        IFNET_IF_DATA_REQUIRE_ALIGNED_64(f)     \
        _CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int64_t)))
 
-#define IFNET_IF_TC_REQUIRE_ALIGNED_64(f) \
-       _CASSERT(!(offsetof(struct ifnet, if_tc.f) % sizeof (u_int64_t)))
-
 enum {
        kProtoKPI_v1    = 1,
        kProtoKPI_v2    = 2
@@ -171,7 +180,7 @@ SLIST_HEAD(proto_hash_entry, if_proto);
 struct dlil_ifnet {
        struct ifnet    dl_if;                  /* public ifnet */
        /*
-        * dlil private fields, protected by dl_if_lock
+        * DLIL private fields, protected by dl_if_lock
         */
        decl_lck_mtx_data(, dl_if_lock);
        TAILQ_ENTRY(dlil_ifnet) dl_if_link;     /* dlil_ifnet link */
@@ -186,6 +195,8 @@ struct dlil_ifnet {
                u_int8_t        asdl[DLIL_SDLMAXLEN]; /* addr storage */
                u_int8_t        msdl[DLIL_SDLMAXLEN]; /* mask storage */
        } dl_if_lladdr;
+       u_int8_t dl_if_descstorage[IF_DESCSIZE]; /* desc storage */
+       struct dlil_threading_info dl_if_inpstorage; /* input thread storage */
        ctrace_t        dl_if_attach;           /* attach PC stacktrace */
        ctrace_t        dl_if_detach;           /* detach PC stacktrace */
 };
@@ -234,12 +245,26 @@ static TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head;
 static lck_grp_t *dlil_lock_group;
 lck_grp_t *ifnet_lock_group;
 static lck_grp_t *ifnet_head_lock_group;
+static lck_grp_t *ifnet_snd_lock_group;
+static lck_grp_t *ifnet_rcv_lock_group;
 lck_attr_t *ifnet_lock_attr;
 decl_lck_rw_data(static, ifnet_head_lock);
 decl_lck_mtx_data(static, dlil_ifnet_lock);
 u_int32_t dlil_filter_count = 0;
 extern u_int32_t       ipv4_ll_arp_aware;
 
+struct sfb_fc_list ifnet_fclist;
+decl_lck_mtx_data(static, ifnet_fclist_lock);
+
+static unsigned int ifnet_fcezone_size;                /* size of ifnet_fce */
+static struct zone *ifnet_fcezone;             /* zone for ifnet_fce */
+
+#define IFNET_FCEZONE_MAX      32              /* maximum elements in zone */
+#define IFNET_FCEZONE_NAME     "ifnet_fcezone" /* zone name */
+
+static void ifnet_fc_thread_func(void *, wait_result_t);
+static void ifnet_fc_init(void);
+
 #if DEBUG
 static unsigned int ifnet_debug = 1;   /* debugging (enabled) */
 #else
@@ -258,12 +283,6 @@ static struct zone *dlif_filt_zone;        /* zone for ifnet_filter */
 #define        DLIF_FILT_ZONE_MAX      8               /* maximum elements in zone */
 #define        DLIF_FILT_ZONE_NAME     "ifnet_filter"  /* zone name */
 
-static unsigned int dlif_inp_size;     /* size of dlil_threading_info */
-static struct zone *dlif_inp_zone;     /* zone for dlil_threading_info */
-
-#define        DLIF_INP_ZONE_MAX       DLIF_ZONE_MAX   /* maximum elements in zone */
-#define        DLIF_INP_ZONE_NAME      "ifnet_thread"  /* zone name */
-
 static unsigned int dlif_phash_size;   /* size of ifnet proto hash table */
 static struct zone *dlif_phash_zone;   /* zone for ifnet proto hash table */
 
@@ -276,6 +295,20 @@ static struct zone *dlif_proto_zone;       /* zone for if_proto */
 #define        DLIF_PROTO_ZONE_MAX     (DLIF_ZONE_MAX*2) /* maximum elements in zone */
 #define        DLIF_PROTO_ZONE_NAME    "ifnet_proto"   /* zone name */
 
+static unsigned int dlif_tcpstat_size;         /* size of tcpstat_local to allocate */
+static unsigned int dlif_tcpstat_bufsize;      /* size of dlif_tcpstat_size + headroom */
+static struct zone *dlif_tcpstat_zone;         /* zone for tcpstat_local */
+
+#define        DLIF_TCPSTAT_ZONE_MAX   1               /* maximum elements in zone */
+#define        DLIF_TCPSTAT_ZONE_NAME  "ifnet_tcpstat" /* zone name */
+
+static unsigned int dlif_udpstat_size;         /* size of udpstat_local to allocate */
+static unsigned int dlif_udpstat_bufsize;      /* size of dlif_udpstat_size + headroom */
+static struct zone *dlif_udpstat_zone;         /* zone for udpstat_local */
+
+#define        DLIF_UDPSTAT_ZONE_MAX   1               /* maximum elements in zone */
+#define        DLIF_UDPSTAT_ZONE_NAME  "ifnet_udpstat" /* zone name */
+
 /*
  * Updating this variable should be done by first acquiring the global
  * radix node head (rnh_lock), in tandem with settting/clearing the
@@ -284,11 +317,9 @@ static struct zone *dlif_proto_zone;       /* zone for if_proto */
 u_int32_t ifnet_aggressive_drainers;
 static u_int32_t net_rtref;
 
-static struct dlil_threading_info dlil_lo_thread;
-__private_extern__  struct dlil_threading_info *dlil_lo_thread_ptr = &dlil_lo_thread;
-
-static struct mbuf *dlil_lo_input_mbuf_head = NULL;
-static struct mbuf *dlil_lo_input_mbuf_tail = NULL;
+static struct dlil_main_threading_info dlil_main_input_thread_info;
+__private_extern__ struct dlil_threading_info *dlil_main_input_thread =
+    (struct dlil_threading_info *)&dlil_main_input_thread_info;
 
 static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg);
 static int dlil_detach_filter_internal(interface_filter_t filter, int detached);
@@ -327,6 +358,10 @@ static errno_t ifproto_media_send_arp(struct ifnet *, u_short,
     const struct sockaddr_dl *, const struct sockaddr *);
 
 static errno_t ifp_if_output(struct ifnet *, struct mbuf *);
+static void ifp_if_start(struct ifnet *);
+static void ifp_if_input_poll(struct ifnet *, u_int32_t, u_int32_t,
+    struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *);
+static errno_t ifp_if_ctl(struct ifnet *, ifnet_ctl_cmd_t, u_int32_t, void *);
 static errno_t ifp_if_demux(struct ifnet *, struct mbuf *, char *,
     protocol_family_t *);
 static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t,
@@ -334,20 +369,42 @@ static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t,
 static errno_t ifp_if_del_proto(struct ifnet *, protocol_family_t);
 static errno_t ifp_if_check_multi(struct ifnet *, const struct sockaddr *);
 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
-    const struct sockaddr *, const char *, const char *);
-static errno_t ifp_if_ioctl(struct ifnet *, unsigned long, void *);
+    const struct sockaddr *, const char *, const char *
+#if CONFIG_EMBEDDED
+    ,
+    u_int32_t *, u_int32_t *
+#endif /* CONFIG_EMBEDDED */
+    );
 static errno_t ifp_if_set_bpf_tap(struct ifnet *, bpf_tap_mode, bpf_packet_func);
 static void ifp_if_free(struct ifnet *);
 static void ifp_if_event(struct ifnet *, const struct kev_msg *);
+static __inline void ifp_inc_traffic_class_in(struct ifnet *, struct mbuf *);
+static __inline void ifp_inc_traffic_class_out(struct ifnet *, struct mbuf *);
 
-static void dlil_input_thread_func(struct dlil_threading_info *inpthread);
+static void dlil_main_input_thread_func(void *, wait_result_t);
+static void dlil_input_thread_func(void *, wait_result_t);
+static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
+static void dlil_rxpoll_calc_limits(struct dlil_threading_info *);
 static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *);
-
-static void ifnet_delayed_thread_func(void);
+static void dlil_terminate_input_thread(struct dlil_threading_info *);
+static void dlil_input_stats_add(const struct ifnet_stat_increment_param *,
+    struct dlil_threading_info *, boolean_t);
+static void dlil_input_stats_sync(struct ifnet *, struct dlil_threading_info *);
+static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *,
+    u_int32_t, ifnet_model_t, boolean_t);
+static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *,
+    const struct ifnet_stat_increment_param *, boolean_t, boolean_t);
+
+static void ifnet_detacher_thread_func(void *, wait_result_t);
+static int ifnet_detacher_thread_cont(int);
 static void ifnet_detach_final(struct ifnet *);
 static void ifnet_detaching_enqueue(struct ifnet *);
 static struct ifnet *ifnet_detaching_dequeue(void);
 
+static void ifnet_start_thread_fn(void *, wait_result_t);
+static void ifnet_poll_thread_fn(void *, wait_result_t);
+static void ifnet_poll(struct ifnet *);
+
 static void ifp_src_route_copyout(struct ifnet *, struct route *);
 static void ifp_src_route_copyin(struct ifnet *, struct route *);
 #if INET6
@@ -355,6 +412,10 @@ static void ifp_src_route6_copyout(struct ifnet *, struct route_in6 *);
 static void ifp_src_route6_copyin(struct ifnet *, struct route_in6 *);
 #endif /* INET6 */
 
+static int sysctl_rxpoll SYSCTL_HANDLER_ARGS;
+static int sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS;
+static int sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS;
+
 /* The following are protected by dlil_ifnet_lock */
 static TAILQ_HEAD(, ifnet) ifnet_detaching_head;
 static u_int32_t ifnet_detaching_cnt;
@@ -363,6 +424,11 @@ static void *ifnet_delayed_run;    /* wait channel for detaching thread */
 extern void bpfdetach(struct ifnet*);
 extern void proto_input_run(void);
 
+extern uint32_t udp_count_opportunistic(unsigned int ifindex, 
+       u_int32_t flags);
+extern uint32_t tcp_count_opportunistic(unsigned int ifindex, 
+       u_int32_t flags);
+
 __private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *);
 
 #if DEBUG
@@ -370,28 +436,108 @@ static int dlil_verbose = 1;
 #else
 static int dlil_verbose = 0;
 #endif /* DEBUG */
-static int dlil_multithreaded_input = 1;
-static int cur_dlil_input_threads = 0;
 #if IFNET_INPUT_SANITY_CHK
-static int dlil_lo_input_mbuf_count = 0;
 /* sanity checking of input packet lists received */
-static int dlil_input_sanity_check = 0;
-#endif
+static u_int32_t dlil_input_sanity_check = 0;
+#endif /* IFNET_INPUT_SANITY_CHK */
+/* rate limit debug messages */
+struct timespec dlil_dbgrate = { 1, 0 };
 
 SYSCTL_DECL(_net_link_generic_system);
 
-SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_verbose, CTLFLAG_RW,
-    &dlil_verbose, 0, "Log DLIL error messages");
-
-SYSCTL_INT(_net_link_generic_system, OID_AUTO, multi_threaded_input, CTLFLAG_RW,
-    &dlil_multithreaded_input , 0, "Uses multiple input thread for DLIL input");
+SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_verbose,
+    CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_verbose, 0, "Log DLIL error messages");
+
+#define        IF_SNDQ_MINLEN  32
+u_int32_t if_sndq_maxlen = IFQ_MAXLEN;
+SYSCTL_PROC(_net_link_generic_system, OID_AUTO, sndq_maxlen,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sndq_maxlen, IFQ_MAXLEN,
+    sysctl_sndq_maxlen, "I", "Default transmit queue max length");
+
+#define        IF_RCVQ_MINLEN  32
+#define IF_RCVQ_MAXLEN 256
+u_int32_t if_rcvq_maxlen = IF_RCVQ_MAXLEN;
+SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_maxlen,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_maxlen, IFQ_MAXLEN,
+    sysctl_rcvq_maxlen, "I", "Default receive queue max length");
+
+#define        IF_RXPOLL_DECAY 2               /* ilog2 of EWMA decay rate (4) */
+static u_int32_t if_rxpoll_decay = IF_RXPOLL_DECAY;
+SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_decay,
+    CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_decay, IF_RXPOLL_DECAY,
+    "ilog2 of EWMA decay rate of avg inbound packets");
+
+#define        IF_RXPOLL_MODE_HOLDTIME (1000ULL * 1000 * 1000) /* 1 sec */
+static u_int64_t if_rxpoll_mode_holdtime = IF_RXPOLL_MODE_HOLDTIME;
+SYSCTL_QUAD(_net_link_generic_system, OID_AUTO, rxpoll_freeze_time,
+    CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_mode_holdtime,
+    "input poll mode freeze time");
+
+#define        IF_RXPOLL_SAMPLETIME    (10ULL * 1000 * 1000)   /* 10 ms */
+static u_int64_t if_rxpoll_sample_holdtime = IF_RXPOLL_SAMPLETIME;
+SYSCTL_QUAD(_net_link_generic_system, OID_AUTO, rxpoll_sample_time,
+    CTLFLAG_RD | CTLFLAG_LOCKED, &if_rxpoll_sample_holdtime,
+    "input poll sampling time");
+
+#define        IF_RXPOLL_INTERVAL_TIME (1ULL * 1000 * 1000)    /* 1 ms */
+static u_int64_t if_rxpoll_interval_time = IF_RXPOLL_INTERVAL_TIME;
+SYSCTL_QUAD(_net_link_generic_system, OID_AUTO, rxpoll_interval_time,
+    CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_time,
+    "input poll interval (time)");
+
+#define        IF_RXPOLL_INTERVAL_PKTS 0                       /* 0 (disabled) */
+static u_int32_t if_rxpoll_interval_pkts = IF_RXPOLL_INTERVAL_PKTS;
+SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_interval_pkts,
+    CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_pkts,
+    IF_RXPOLL_INTERVAL_PKTS, "input poll interval (packets)");
+
+#define        IF_RXPOLL_WLOWAT                5
+static u_int32_t if_rxpoll_wlowat = IF_RXPOLL_WLOWAT;
+SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_lowat,
+    CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_wlowat, IF_RXPOLL_WLOWAT,
+    "input poll wakeup low watermark");
+
+#define        IF_RXPOLL_WHIWAT                100
+static u_int32_t if_rxpoll_whiwat = IF_RXPOLL_WHIWAT;
+SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_hiwat,
+    CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_whiwat, IF_RXPOLL_WHIWAT,
+    "input poll wakeup high watermark");
+
+static u_int32_t if_rxpoll_max = 0;                    /* 0 (automatic) */
+SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_max,
+    CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_max, 0,
+    "max packets per poll call");
+
+static u_int32_t if_rxpoll = 1;
+SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll, 0,
+    sysctl_rxpoll, "I", "enable opportunistic input polling");
+
+u_int32_t if_bw_smoothing_val = 3;
+SYSCTL_UINT(_net_link_generic_system, OID_AUTO, if_bw_smoothing_val,
+    CTLFLAG_RW | CTLFLAG_LOCKED, &if_bw_smoothing_val, 0, "");
+
+u_int32_t if_bw_measure_size = 10;
+SYSCTL_INT(_net_link_generic_system, OID_AUTO, if_bw_measure_size,
+    CTLFLAG_RW | CTLFLAG_LOCKED, &if_bw_measure_size, 0, "");
+
+static u_int32_t cur_dlil_input_threads = 0;
+SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_threads,
+    CTLFLAG_RD | CTLFLAG_LOCKED, &cur_dlil_input_threads , 0,
+    "Current number of DLIL input threads");
 
 #if IFNET_INPUT_SANITY_CHK
-SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_input_sanity_check,
-    CTLFLAG_RW, &dlil_input_sanity_check , 0,
+SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_sanity_check,
+    CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_input_sanity_check , 0,
     "Turn on sanity checking in DLIL input");
-#endif
+#endif /* IFNET_INPUT_SANITY_CHK */
 
+static u_int32_t if_flowadv = 1;
+SYSCTL_UINT(_net_link_generic_system, OID_AUTO, flow_advisory,
+    CTLFLAG_RW | CTLFLAG_LOCKED, &if_flowadv, 1,
+    "enable flow-advisory mechanism");
+
+unsigned int net_rxpoll = 1;
 unsigned int net_affinity = 1;
 static kern_return_t dlil_affinity_set(struct thread *, u_int32_t);
 
@@ -399,10 +545,47 @@ extern u_int32_t  inject_buckets;
 
 static lck_grp_attr_t  *dlil_grp_attributes = NULL;
 static lck_attr_t      *dlil_lck_attributes = NULL;
-static lck_grp_t       *dlil_input_lock_grp = NULL;
 
 #define PROTO_HASH_SLOTS       0x5
 
+#define        DLIL_INPUT_CHECK(m, ifp) {                                      \
+       struct ifnet *_rcvif = mbuf_pkthdr_rcvif(m);                    \
+       if (_rcvif == NULL || (ifp != lo_ifp && _rcvif != ifp) ||       \
+           !(mbuf_flags(m) & MBUF_PKTHDR)) {                           \
+               panic_plain("%s: invalid mbuf %p\n", __func__, m);      \
+               /* NOTREACHED */                                        \
+       }                                                               \
+}
+
+#define        DLIL_EWMA(old, new, decay) do {                                 \
+       u_int32_t _avg;                                                 \
+       if ((_avg = (old)) > 0)                                         \
+               _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
+       else                                                            \
+               _avg = (new);                                           \
+       (old) = _avg;                                                   \
+} while (0)
+
+#define        MBPS    (1ULL * 1000 * 1000)
+#define        GBPS    (MBPS * 1000)
+
+struct rxpoll_time_tbl {
+       u_int64_t       speed;          /* downlink speed */
+       u_int32_t       plowat;         /* packets low watermark */
+       u_int32_t       phiwat;         /* packets high watermark */
+       u_int32_t       blowat;         /* bytes low watermark */
+       u_int32_t       bhiwat;         /* bytes high watermark */
+};
+
+static struct rxpoll_time_tbl rxpoll_tbl[] = {
+       {  10 * MBPS,   2,      8,      (1 * 1024),     (6 * 1024)      },
+       { 100 * MBPS,   10,     40,     (4 * 1024),     (64 * 1024)     },
+       {   1 * GBPS,   10,     40,     (4 * 1024),     (64 * 1024)     },
+       {  10 * GBPS,   10,     40,     (4 * 1024),     (64 * 1024)     },
+       { 100 * GBPS,   10,     40,     (4 * 1024),     (64 * 1024)     },
+       { 0, 0, 0, 0, 0 }
+};
+
 /*
  * Internal functions.
  */
@@ -634,38 +817,154 @@ dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass,
        dlil_event_internal(ifp, &ev_msg);
 }
 
+__private_extern__ int
+dlil_alloc_local_stats(struct ifnet *ifp)
+{
+       int ret = EINVAL;
+       void *buf, *base, **pbuf;
+
+       if (ifp == NULL)
+               goto end;
+
+       if (ifp->if_tcp_stat == NULL && ifp->if_udp_stat == NULL) {
+               /* allocate tcpstat_local structure */
+               buf = zalloc(dlif_tcpstat_zone);
+               if (buf == NULL) {
+                       ret = ENOMEM;
+                       goto end;
+               }
+               bzero(buf, dlif_tcpstat_bufsize);
+
+               /* Get the 64-bit aligned base address for this object */
+               base = (void *)P2ROUNDUP((intptr_t)buf + sizeof (u_int64_t),
+                   sizeof (u_int64_t));
+               VERIFY(((intptr_t)base + dlif_tcpstat_size) <=
+                   ((intptr_t)buf + dlif_tcpstat_bufsize));
+
+               /*
+                * Wind back a pointer size from the aligned base and
+                * save the original address so we can free it later.
+                */
+               pbuf = (void **)((intptr_t)base - sizeof (void *));
+               *pbuf = buf;
+               ifp->if_tcp_stat = base;
+
+               /* allocate udpstat_local structure */
+               buf = zalloc(dlif_udpstat_zone);
+               if (buf == NULL) {
+                       ret = ENOMEM;
+                       goto end;
+               }
+               bzero(buf, dlif_udpstat_bufsize);
+
+               /* Get the 64-bit aligned base address for this object */
+               base = (void *)P2ROUNDUP((intptr_t)buf + sizeof (u_int64_t),
+                   sizeof (u_int64_t));
+               VERIFY(((intptr_t)base + dlif_udpstat_size) <=
+                   ((intptr_t)buf + dlif_udpstat_bufsize));
+
+               /*
+                * Wind back a pointer size from the aligned base and
+                * save the original address so we can free it later.
+                */
+               pbuf = (void **)((intptr_t)base - sizeof (void *));
+               *pbuf = buf;
+               ifp->if_udp_stat = base;
+
+               VERIFY(IS_P2ALIGNED(ifp->if_tcp_stat, sizeof (u_int64_t)) &&
+                   IS_P2ALIGNED(ifp->if_udp_stat, sizeof (u_int64_t)));
+
+               ret = 0;
+       }
+
+end:
+       if (ret != 0) {
+               if (ifp->if_tcp_stat != NULL) {
+                       pbuf = (void **)
+                           ((intptr_t)ifp->if_tcp_stat - sizeof (void *));
+                       zfree(dlif_tcpstat_zone, *pbuf);
+                       ifp->if_tcp_stat = NULL;
+               }
+               if (ifp->if_udp_stat != NULL) {
+                       pbuf = (void **)
+                           ((intptr_t)ifp->if_udp_stat - sizeof (void *));
+                       zfree(dlif_udpstat_zone, *pbuf);
+                       ifp->if_udp_stat = NULL;
+               }
+       }
+
+       return (ret);
+}
+
 static int
-dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inputthread)
+dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp)
 {
+       thread_continue_t func;
+       u_int32_t limit;
        int error;
 
-       bzero(inputthread, sizeof(*inputthread));
-       /* loopback ifp may not be configured at dlil_init time. */
-       if (ifp == lo_ifp) {
-               (void) strlcat(inputthread->input_name,
-                   "dlil_input_main_thread_mtx", DLIL_THREADNAME_LEN);
+       /* NULL ifp indicates the main input thread, called at dlil_init time */
+       if (ifp == NULL) {
+               func = dlil_main_input_thread_func;
+               VERIFY(inp == dlil_main_input_thread);
+               (void) strlcat(inp->input_name,
+                   "main_input", DLIL_THREADNAME_LEN);
+       } else if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
+               func = dlil_rxpoll_input_thread_func;
+               VERIFY(inp != dlil_main_input_thread);
+               (void) snprintf(inp->input_name, DLIL_THREADNAME_LEN,
+                   "%s%d_input_poll", ifp->if_name, ifp->if_unit);
        } else {
-               (void) snprintf(inputthread->input_name, DLIL_THREADNAME_LEN,
-                   "dlil_input_%s%d_mtx", ifp->if_name, ifp->if_unit);
+               func = dlil_input_thread_func;
+               VERIFY(inp != dlil_main_input_thread);
+               (void) snprintf(inp->input_name, DLIL_THREADNAME_LEN,
+                   "%s%d_input", ifp->if_name, ifp->if_unit);
        }
+       VERIFY(inp->input_thr == THREAD_NULL);
 
-       inputthread->lck_grp = lck_grp_alloc_init(inputthread->input_name,
-           dlil_grp_attributes);
-       lck_mtx_init(&inputthread->input_lck, inputthread->lck_grp,
-           dlil_lck_attributes);
+       inp->lck_grp = lck_grp_alloc_init(inp->input_name, dlil_grp_attributes);
+       lck_mtx_init(&inp->input_lck, inp->lck_grp, dlil_lck_attributes);
+
+       inp->mode = IFNET_MODEL_INPUT_POLL_OFF;
+       inp->ifp = ifp;         /* NULL for main input thread */
+
+       net_timerclear(&inp->mode_holdtime);
+       net_timerclear(&inp->mode_lasttime);
+       net_timerclear(&inp->sample_holdtime);
+       net_timerclear(&inp->sample_lasttime);
+       net_timerclear(&inp->dbg_lasttime);
+
+       /*
+        * For interfaces that support opportunistic polling, set the
+        * low and high watermarks for outstanding inbound packets/bytes.
+        * Also define freeze times for transitioning between modes
+        * and updating the average.
+        */
+       if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
+               limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
+               dlil_rxpoll_calc_limits(inp);
+       } else {
+               limit = (u_int32_t)-1;
+       }
+
+       _qinit(&inp->rcvq_pkts, Q_DROPTAIL, limit);
+       if (inp == dlil_main_input_thread) {
+               struct dlil_main_threading_info *inpm =
+                   (struct dlil_main_threading_info *)inp;
+               _qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit);
+       }
 
-       error= kernel_thread_start((thread_continue_t)dlil_input_thread_func,
-           inputthread, &inputthread->input_thread);
-       if (error == 0) {
-               ml_thread_policy(inputthread->input_thread, MACHINE_GROUP,
+       error = kernel_thread_start(func, inp, &inp->input_thr);
+       if (error == KERN_SUCCESS) {
+               ml_thread_policy(inp->input_thr, MACHINE_GROUP,
                    (MACHINE_NETWORK_GROUP|MACHINE_NETWORK_NETISR));
                /*
-                * Except for the loopback dlil input thread, we create
-                * an affinity set so that the matching workloop thread
-                * can be scheduled on the same processor set.
+                * We create an affinity set so that the matching workloop
+                * thread or the starter thread (for loopback) can be
+                * scheduled on the same processor set as the input thread.
                 */
-               if (net_affinity && inputthread != dlil_lo_thread_ptr) {
-                       struct thread *tp = inputthread->input_thread;
+               if (net_affinity) {
+                       struct thread *tp = inp->input_thr;
                        u_int32_t tag;
                        /*
                         * Randomize to reduce the probability
@@ -674,23 +973,79 @@ dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inputthread)
                        read_random(&tag, sizeof (tag));
                        if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
                                thread_reference(tp);
-                               inputthread->tag = tag;
-                               inputthread->net_affinity = TRUE;
+                               inp->tag = tag;
+                               inp->net_affinity = TRUE;
                        }
                }
+       } else if (inp == dlil_main_input_thread) {
+               panic_plain("%s: couldn't create main input thread", __func__);
+               /* NOTREACHED */
        } else {
-               panic("%s: couldn't create thread", __func__);
+               panic_plain("%s: couldn't create %s%d input thread", __func__,
+                   ifp->if_name, ifp->if_unit);
                /* NOTREACHED */
        }
        OSAddAtomic(1, &cur_dlil_input_threads);
-#if DLIL_DEBUG
-       printf("%s: threadinfo: %p input_thread=%p threads: cur=%d max=%d\n",
-           __func__, inputthread, inputthread->input_thread,
-           dlil_multithreaded_input, cur_dlil_input_threads);
-#endif
+
        return (error);
 }
 
+static void
+dlil_terminate_input_thread(struct dlil_threading_info *inp)
+{
+       struct ifnet *ifp;
+
+       VERIFY(current_thread() == inp->input_thr);
+       VERIFY(inp != dlil_main_input_thread);
+
+       OSAddAtomic(-1, &cur_dlil_input_threads);
+
+       lck_mtx_destroy(&inp->input_lck, inp->lck_grp);
+       lck_grp_free(inp->lck_grp);
+
+       inp->input_waiting = 0;
+       inp->wtot = 0;
+       bzero(inp->input_name, sizeof (inp->input_name));
+       ifp = inp->ifp;
+       inp->ifp = NULL;
+       VERIFY(qhead(&inp->rcvq_pkts) == NULL && qempty(&inp->rcvq_pkts));
+       qlimit(&inp->rcvq_pkts) = 0;
+       bzero(&inp->stats, sizeof (inp->stats));
+
+       VERIFY(!inp->net_affinity);
+       inp->input_thr = THREAD_NULL;
+       VERIFY(inp->wloop_thr == THREAD_NULL);
+       VERIFY(inp->poll_thr == THREAD_NULL);
+       VERIFY(inp->tag == 0);
+
+       inp->mode = IFNET_MODEL_INPUT_POLL_OFF;
+       bzero(&inp->tstats, sizeof (inp->tstats));
+       bzero(&inp->pstats, sizeof (inp->pstats));
+       bzero(&inp->sstats, sizeof (inp->sstats));
+
+       net_timerclear(&inp->mode_holdtime);
+       net_timerclear(&inp->mode_lasttime);
+       net_timerclear(&inp->sample_holdtime);
+       net_timerclear(&inp->sample_lasttime);
+       net_timerclear(&inp->dbg_lasttime);
+
+#if IFNET_INPUT_SANITY_CHK
+       inp->input_mbuf_cnt = 0;
+#endif /* IFNET_INPUT_SANITY_CHK */
+
+       if (dlil_verbose) {
+               printf("%s%d: input thread terminated\n",
+                   ifp->if_name, ifp->if_unit);
+       }
+
+       /* for the extra refcnt from kernel_thread_start() */
+       thread_deallocate(current_thread());
+
+       /* this is the end */
+       thread_terminate(current_thread());
+       /* NOTREACHED */
+}
+
 static kern_return_t
 dlil_affinity_set(struct thread *tp, u_int32_t tag)
 {
@@ -721,6 +1076,7 @@ dlil_init(void)
        IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
        IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
        IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
+       IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
 
        IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
        IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors)
@@ -733,19 +1089,7 @@ dlil_init(void)
        IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
        IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
        IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
-
-       IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ibkpackets);
-       IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ibkbytes);
-       IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_obkpackets);
-       IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_obkbytes);
-       IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ivipackets);
-       IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ivibytes);
-       IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ovipackets);
-       IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ovibytes);
-       IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ivopackets);
-       IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ivobytes);
-       IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ovopackets);
-       IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ovobytes);
+       IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
 
        /*
         * These IF_HWASSIST_ flags must be equal to their IFNET_* counterparts.
@@ -765,10 +1109,13 @@ dlil_init(void)
         * Make sure we have at least IF_LLREACH_MAXLEN in the llreach info.
         */
        _CASSERT(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN);
+       _CASSERT(IFNET_LLREACHINFO_ADDRLEN == IF_LLREACHINFO_ADDRLEN);
 
        PE_parse_boot_argn("net_affinity", &net_affinity,
            sizeof (net_affinity));
 
+       PE_parse_boot_argn("net_rxpoll", &net_rxpoll, sizeof (net_rxpoll));
+
        PE_parse_boot_argn("net_rtref", &net_rtref, sizeof (net_rtref));
 
        PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof (ifnet_debug));
@@ -781,7 +1128,8 @@ dlil_init(void)
        dlif_zone = zinit(dlif_bufsize, DLIF_ZONE_MAX * dlif_bufsize,
            0, DLIF_ZONE_NAME);
        if (dlif_zone == NULL) {
-               panic("%s: failed allocating %s", __func__, DLIF_ZONE_NAME);
+               panic_plain("%s: failed allocating %s", __func__,
+                   DLIF_ZONE_NAME);
                /* NOTREACHED */
        }
        zone_change(dlif_zone, Z_EXPAND, TRUE);
@@ -791,28 +1139,18 @@ dlil_init(void)
        dlif_filt_zone = zinit(dlif_filt_size,
            DLIF_FILT_ZONE_MAX * dlif_filt_size, 0, DLIF_FILT_ZONE_NAME);
        if (dlif_filt_zone == NULL) {
-               panic("%s: failed allocating %s", __func__,
+               panic_plain("%s: failed allocating %s", __func__,
                    DLIF_FILT_ZONE_NAME);
                /* NOTREACHED */
        }
        zone_change(dlif_filt_zone, Z_EXPAND, TRUE);
        zone_change(dlif_filt_zone, Z_CALLERACCT, FALSE);
 
-       dlif_inp_size = sizeof (struct dlil_threading_info);
-       dlif_inp_zone = zinit(dlif_inp_size,
-           DLIF_INP_ZONE_MAX * dlif_inp_size, 0, DLIF_INP_ZONE_NAME);
-       if (dlif_inp_zone == NULL) {
-               panic("%s: failed allocating %s", __func__, DLIF_INP_ZONE_NAME);
-               /* NOTREACHED */
-       }
-       zone_change(dlif_inp_zone, Z_EXPAND, TRUE);
-       zone_change(dlif_inp_zone, Z_CALLERACCT, FALSE);
-
        dlif_phash_size = sizeof (struct proto_hash_entry) * PROTO_HASH_SLOTS;
        dlif_phash_zone = zinit(dlif_phash_size,
            DLIF_PHASH_ZONE_MAX * dlif_phash_size, 0, DLIF_PHASH_ZONE_NAME);
        if (dlif_phash_zone == NULL) {
-               panic("%s: failed allocating %s", __func__,
+               panic_plain("%s: failed allocating %s", __func__,
                    DLIF_PHASH_ZONE_NAME);
                /* NOTREACHED */
        }
@@ -823,13 +1161,47 @@ dlil_init(void)
        dlif_proto_zone = zinit(dlif_proto_size,
            DLIF_PROTO_ZONE_MAX * dlif_proto_size, 0, DLIF_PROTO_ZONE_NAME);
        if (dlif_proto_zone == NULL) {
-               panic("%s: failed allocating %s", __func__,
+               panic_plain("%s: failed allocating %s", __func__,
                    DLIF_PROTO_ZONE_NAME);
                /* NOTREACHED */
        }
        zone_change(dlif_proto_zone, Z_EXPAND, TRUE);
        zone_change(dlif_proto_zone, Z_CALLERACCT, FALSE);
 
+       dlif_tcpstat_size = sizeof (struct tcpstat_local);
+       /* Enforce 64-bit alignment for tcpstat_local structure */
+       dlif_tcpstat_bufsize =
+           dlif_tcpstat_size + sizeof (void *) + sizeof (u_int64_t);
+       dlif_tcpstat_bufsize =
+           P2ROUNDUP(dlif_tcpstat_bufsize, sizeof (u_int64_t));
+       dlif_tcpstat_zone = zinit(dlif_tcpstat_bufsize,
+           DLIF_TCPSTAT_ZONE_MAX * dlif_tcpstat_bufsize, 0,
+           DLIF_TCPSTAT_ZONE_NAME);
+       if (dlif_tcpstat_zone == NULL) {
+               panic_plain("%s: failed allocating %s", __func__,
+                   DLIF_TCPSTAT_ZONE_NAME);
+               /* NOTREACHED */
+       }
+       zone_change(dlif_tcpstat_zone, Z_EXPAND, TRUE);
+       zone_change(dlif_tcpstat_zone, Z_CALLERACCT, FALSE);
+
+       dlif_udpstat_size = sizeof (struct udpstat_local);
+       /* Enforce 64-bit alignment for udpstat_local structure */
+       dlif_udpstat_bufsize =
+           dlif_udpstat_size + sizeof (void *) + sizeof (u_int64_t);
+       dlif_udpstat_bufsize =
+           P2ROUNDUP(dlif_udpstat_bufsize, sizeof (u_int64_t));
+       dlif_udpstat_zone = zinit(dlif_udpstat_bufsize,
+           DLIF_TCPSTAT_ZONE_MAX * dlif_udpstat_bufsize, 0,
+           DLIF_UDPSTAT_ZONE_NAME);
+       if (dlif_udpstat_zone == NULL) {
+               panic_plain("%s: failed allocating %s", __func__,
+                   DLIF_UDPSTAT_ZONE_NAME);
+               /* NOTREACHED */
+       }
+       zone_change(dlif_udpstat_zone, Z_EXPAND, TRUE);
+       zone_change(dlif_udpstat_zone, Z_CALLERACCT, FALSE);
+
        ifnet_llreach_init();
 
        TAILQ_INIT(&dlil_ifnet_head);
@@ -839,13 +1211,15 @@ dlil_init(void)
        /* Setup the lock groups we will use */
        dlil_grp_attributes = lck_grp_attr_alloc_init();
 
-       dlil_lock_group = lck_grp_alloc_init("dlil internal locks",
+       dlil_lock_group = lck_grp_alloc_init("DLIL internal locks",
            dlil_grp_attributes);
        ifnet_lock_group = lck_grp_alloc_init("ifnet locks",
            dlil_grp_attributes);
        ifnet_head_lock_group = lck_grp_alloc_init("ifnet head lock",
            dlil_grp_attributes);
-       dlil_input_lock_grp = lck_grp_alloc_init("dlil input lock",
+       ifnet_rcv_lock_group = lck_grp_alloc_init("ifnet rcv locks",
+           dlil_grp_attributes);
+       ifnet_snd_lock_group = lck_grp_alloc_init("ifnet snd locks",
            dlil_grp_attributes);
 
        /* Setup the lock attributes we will use */
@@ -857,20 +1231,21 @@ dlil_init(void)
            dlil_lck_attributes);
        lck_mtx_init(&dlil_ifnet_lock, dlil_lock_group, dlil_lck_attributes);
 
+       ifnet_fc_init();
+
        lck_attr_free(dlil_lck_attributes);
        dlil_lck_attributes = NULL;
 
        ifa_init();
-
        /*
-        * Create and start up the first dlil input thread once everything
-        * is initialized.
+        * Create and start up the main DLIL input thread and the interface
+        * detacher threads once everything is initialized.
         */
-       dlil_create_input_thread(lo_ifp, dlil_lo_thread_ptr);
+       dlil_create_input_thread(NULL, dlil_main_input_thread);
 
-       if (kernel_thread_start((thread_continue_t)ifnet_delayed_thread_func,
-           NULL, &thread) != 0) {
-               panic("%s: couldn't create detach thread", __func__);
+       if (kernel_thread_start(ifnet_detacher_thread_func,
+           NULL, &thread) != KERN_SUCCESS) {
+               panic_plain("%s: couldn't create detacher thread", __func__);
                /* NOTREACHED */
        }
        thread_deallocate(thread);
@@ -879,6 +1254,12 @@ dlil_init(void)
        /* Initialize the packet filter */
        pfinit();
 #endif /* PF */
+
+       /* Initialize queue algorithms */
+       classq_init();
+
+       /* Initialize packet schedulers */
+       pktsched_init();
 }
 
 static void
@@ -1080,276 +1461,1157 @@ dlil_detach_filter(interface_filter_t filter)
        dlil_detach_filter_internal(filter, 0);
 }
 
+/*
+ * Main input thread:
+ *
+ *   a) handles all inbound packets for lo0
+ *   b) handles all inbound packets for interfaces with no dedicated
+ *     input thread (e.g. anything but Ethernet/PDP or those that support
+ *     opportunistic polling.)
+ *   c) protocol registrations
+ *   d) packet injections
+ */
 static void
-dlil_input_thread_func(struct dlil_threading_info *inputthread)
+dlil_main_input_thread_func(void *v, wait_result_t w)
 {
+#pragma unused(w)
+       struct dlil_main_threading_info *inpm = v;
+       struct dlil_threading_info *inp = v;
+
+       VERIFY(inp == dlil_main_input_thread);
+       VERIFY(inp->ifp == NULL);
+       VERIFY(inp->mode == IFNET_MODEL_INPUT_POLL_OFF);
+
        while (1) {
                struct mbuf *m = NULL, *m_loop = NULL;
-#if IFNET_INPUT_SANITY_CHK
-               int             loop_cnt = 0, mbuf_cnt;
-               int             count;
-               struct mbuf *m1;
-#endif /* IFNET_INPUT_SANITY_CHK */
+               u_int32_t m_cnt, m_cnt_loop;
+               boolean_t proto_req;
 
-               lck_mtx_lock_spin(&inputthread->input_lck);
+               lck_mtx_lock_spin(&inp->input_lck);
 
                /* Wait until there is work to be done */
-               while (!(inputthread->input_waiting & ~DLIL_INPUT_RUNNING)) {
-                       inputthread->input_waiting &= ~DLIL_INPUT_RUNNING;
-                       msleep(&inputthread->input_waiting,
-                           &inputthread->input_lck, 0,
-                           inputthread->input_name, 0);
+               while (!(inp->input_waiting & ~DLIL_INPUT_RUNNING)) {
+                       inp->input_waiting &= ~DLIL_INPUT_RUNNING;
+                       (void) msleep(&inp->input_waiting, &inp->input_lck,
+                           (PZERO - 1) | PSPIN, inp->input_name, NULL);
                }
 
-               lck_mtx_assert(&inputthread->input_lck, LCK_MTX_ASSERT_OWNED);
+               inp->input_waiting |= DLIL_INPUT_RUNNING;
+               inp->input_waiting &= ~DLIL_INPUT_WAITING;
 
-               m = inputthread->mbuf_head;
-               inputthread->mbuf_head = NULL;
-               inputthread->mbuf_tail = NULL;
+               /* Main input thread cannot be terminated */
+               VERIFY(!(inp->input_waiting & DLIL_INPUT_TERMINATE));
 
-               if (inputthread->input_waiting & DLIL_INPUT_TERMINATE) {
-                       lck_mtx_unlock(&inputthread->input_lck);
+               proto_req = (inp->input_waiting &
+                   (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));
 
-                       if (m != NULL)
-                               mbuf_freem_list(m);
+               /* Packets for non-dedicated interfaces other than lo0 */
+               m_cnt = qlen(&inp->rcvq_pkts);
+               m = _getq_all(&inp->rcvq_pkts);
 
-                       OSAddAtomic(-1, &cur_dlil_input_threads);
+               /* Packets exclusive for lo0 */
+               m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
+               m_loop = _getq_all(&inpm->lo_rcvq_pkts);
 
-                       lck_mtx_destroy(&inputthread->input_lck,
-                           inputthread->lck_grp);
-                       lck_grp_free(inputthread->lck_grp);
+               inp->wtot = 0;
 
-                       zfree(dlif_inp_zone, inputthread);
+               lck_mtx_unlock(&inp->input_lck);
 
-                       /* for the extra refcnt from kernel_thread_start() */
-                       thread_deallocate(current_thread());
+               /*
+               * NOTE warning %%% attention !!!!
+               * We should think about putting some thread starvation
+               * safeguards if we deal with long chains of packets.
+               */
+               if (m_loop != NULL)
+                       dlil_input_packet_list_extended(lo_ifp, m_loop,
+                           m_cnt_loop, inp->mode);
 
-                       /* this is the end */
-                       thread_terminate(current_thread());
-                       /* NOTREACHED */
-                       return;
-               }
+               if (m != NULL)
+                       dlil_input_packet_list_extended(NULL, m,
+                           m_cnt, inp->mode);
+
+               if (proto_req)
+                       proto_input_run();
+       }
+
+       /* NOTREACHED */
+       VERIFY(0);      /* we should never get here */
+}
+
+/*
+ * Input thread for interfaces with legacy input model.
+ */
+static void
+dlil_input_thread_func(void *v, wait_result_t w)
+{
+#pragma unused(w)
+       struct dlil_threading_info *inp = v;
+       struct ifnet *ifp = inp->ifp;
+
+       VERIFY(inp != dlil_main_input_thread);
+       VERIFY(ifp != NULL);
+       VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll);
+       VERIFY(inp->mode == IFNET_MODEL_INPUT_POLL_OFF);
 
-               inputthread->input_waiting |= DLIL_INPUT_RUNNING;
-               inputthread->input_waiting &= ~DLIL_INPUT_WAITING;
+       while (1) {
+               struct mbuf *m = NULL;
+               u_int32_t m_cnt;
+
+               lck_mtx_lock_spin(&inp->input_lck);
 
-               if (inputthread == dlil_lo_thread_ptr) {
-                       m_loop = dlil_lo_input_mbuf_head;
-                       dlil_lo_input_mbuf_head = NULL;
-                       dlil_lo_input_mbuf_tail = NULL;
+               /* Wait until there is work to be done */
+               while (!(inp->input_waiting & ~DLIL_INPUT_RUNNING)) {
+                       inp->input_waiting &= ~DLIL_INPUT_RUNNING;
+                       (void) msleep(&inp->input_waiting, &inp->input_lck,
+                           (PZERO - 1) | PSPIN, inp->input_name, NULL);
                }
 
-#if IFNET_INPUT_SANITY_CHK
-               if (dlil_input_sanity_check != 0) {
-                       mbuf_cnt = inputthread->mbuf_count;
-                       inputthread->mbuf_count = 0;
-                       if (inputthread == dlil_lo_thread_ptr) {
-                               loop_cnt = dlil_lo_input_mbuf_count;
-                               dlil_lo_input_mbuf_count = 0;
-                       }
+               inp->input_waiting |= DLIL_INPUT_RUNNING;
+               inp->input_waiting &= ~DLIL_INPUT_WAITING;
 
-                       lck_mtx_unlock(&inputthread->input_lck);
+               /*
+                * Protocol registration and injection must always use
+                * the main input thread; in theory the latter can utilize
+                * the corresponding input thread where the packet arrived
+                * on, but that requires our knowing the interface in advance
+                * (and the benefits might not worth the trouble.)
+                */
+               VERIFY(!(inp->input_waiting &
+                   (DLIL_PROTO_WAITING|DLIL_PROTO_REGISTER)));
 
-                       for (m1 = m, count = 0; m1; m1 = mbuf_nextpkt(m1)) {
-                               count++;
-                       }
-                       if (count != mbuf_cnt) {
-                               panic("%s - thread=%p reg. loop queue "
-                                   "has %d packets, should have %d\n",
-                                   __func__, inputthread, count, mbuf_cnt);
-                               /* NOTREACHED */
-                       }
+               /* Packets for this interface */
+               m_cnt = qlen(&inp->rcvq_pkts);
+               m = _getq_all(&inp->rcvq_pkts);
 
-                       if (inputthread == dlil_lo_thread_ptr) {
-                               for (m1 = m_loop, count = 0; m1;
-                                   m1 = mbuf_nextpkt(m1)) {
-                                       count++;
-                               }
-                               if (count != loop_cnt) {
-                                       panic("%s - thread=%p loop queue "
-                                           "has %d packets, should have %d\n",
-                                           __func__, inputthread, count,
-                                           loop_cnt);
-                                       /* NOTREACHED */
-                               }
-                       }
-               } else
-#endif /* IFNET_INPUT_SANITY_CHK */
-               {
-                       lck_mtx_unlock(&inputthread->input_lck);
+               if (inp->input_waiting & DLIL_INPUT_TERMINATE) {
+                       lck_mtx_unlock(&inp->input_lck);
+
+                       /* Free up pending packets */
+                       if (m != NULL)
+                               mbuf_freem_list(m);
+
+                       dlil_terminate_input_thread(inp);
+                       /* NOTREACHED */
+                       return;
                }
 
+               inp->wtot = 0;
+
+               dlil_input_stats_sync(ifp, inp);
+
+               lck_mtx_unlock(&inp->input_lck);
 
                /*
                * NOTE warning %%% attention !!!!
                * We should think about putting some thread starvation
                * safeguards if we deal with long chains of packets.
                */
-               if (m_loop) {
-                       if (inputthread == dlil_lo_thread_ptr) {
-                               dlil_input_packet_list(lo_ifp, m_loop);
-                       }
-#if IFNET_INPUT_SANITY_CHK
-                       else {
-                               panic("%s - thread=%p loop queue has %d "
-                                   "packets, should have none!\n", __func__,
-                                   inputthread, loop_cnt);
-                               /* NOTREACHED */
-                       }
-#endif /* IFNET_INPUT_SANITY_CHK */
-               }
-
                if (m != NULL)
-                       dlil_input_packet_list(0, m);
-
-               lck_mtx_lock_spin(&inputthread->input_lck);
-
-               if (inputthread->input_waiting &
-                   (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER))  {
-                       lck_mtx_unlock(&inputthread->input_lck);
-                       proto_input_run();
-               } else {
-                       lck_mtx_unlock(&inputthread->input_lck);
-               }
+                       dlil_input_packet_list_extended(NULL, m,
+                           m_cnt, inp->mode);
        }
+
+       /* NOTREACHED */
+       VERIFY(0);      /* we should never get here */
 }
 
-errno_t
-ifnet_input(ifnet_t ifp, mbuf_t m_head,
-    const struct ifnet_stat_increment_param *stats)
+/*
+ * Input thread for interfaces with opportunistic polling input model.
+ */
+static void
+dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
 {
-       struct thread *tp = current_thread();
-       mbuf_t          m_tail;
-       struct dlil_threading_info *inp;
-#if IFNET_INPUT_SANITY_CHK
-       u_int32_t       pkt_count = 0;
-#endif /* IFNET_INPUT_SANITY_CHK */
+#pragma unused(w)
+       struct dlil_threading_info *inp = v;
+       struct ifnet *ifp = inp->ifp;
+       struct timespec ts;
 
-       if (ifp == NULL || m_head == NULL) {
-               if (m_head != NULL)
-                       mbuf_freem_list(m_head);
-               return (EINVAL);
-       }
+       VERIFY(inp != dlil_main_input_thread);
+       VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL));
 
-       m_tail = m_head;
        while (1) {
-#if IFNET_INPUT_SANITY_CHK
-               if (dlil_input_sanity_check != 0) {
-                       ifnet_t rcvif;
+               struct mbuf *m = NULL;
+               u_int32_t m_cnt, m_size, poll_req = 0;
+               ifnet_model_t mode;
+               struct timespec now, delta;
 
-                       rcvif = mbuf_pkthdr_rcvif(m_tail);
-                       pkt_count++;
+               lck_mtx_lock_spin(&inp->input_lck);
 
-                       if (rcvif == NULL ||
-                           (ifp->if_type != IFT_LOOP && rcvif != ifp) ||
-                           !(mbuf_flags(m_head) & MBUF_PKTHDR)) {
-                               panic("%s - invalid mbuf %p\n",
-                                   __func__, m_tail);
-                               /* NOTREACHED */
-                       }
+               /* Link parameters changed? */
+               if (ifp->if_poll_update != 0) {
+                       ifp->if_poll_update = 0;
+                       dlil_rxpoll_calc_limits(inp);
                }
-#endif /* IFNET_INPUT_SANITY_CHK */
-               if (mbuf_nextpkt(m_tail) == NULL)
-                       break;
-               m_tail = mbuf_nextpkt(m_tail);
-       }
 
-       inp = ifp->if_input_thread;
+               /* Current operating mode */
+               mode = inp->mode;
 
-       if (dlil_multithreaded_input == 0 || inp == NULL)
-               inp = dlil_lo_thread_ptr;
-
-       /*
-        * If there is a matching dlil input thread associated with an
-        * affinity set, associate this workloop thread with the same set.
-        * We will only do this once.
-        */
-       lck_mtx_lock_spin(&inp->input_lck);
-       if (inp->net_affinity && inp->workloop_thread == NULL) {
-               u_int32_t tag = inp->tag;
-               inp->workloop_thread = tp;
-               lck_mtx_unlock(&inp->input_lck);
+               /* Wait until there is work to be done */
+               while (!(inp->input_waiting & ~DLIL_INPUT_RUNNING) &&
+                   qempty(&inp->rcvq_pkts)) {
+                       inp->input_waiting &= ~DLIL_INPUT_RUNNING;
+                       (void) msleep(&inp->input_waiting, &inp->input_lck,
+                           (PZERO - 1) | PSPIN, inp->input_name, NULL);
+               }
 
-               /* Associated the current thread with the new affinity tag */
-               (void) dlil_affinity_set(tp, tag);
+               inp->input_waiting |= DLIL_INPUT_RUNNING;
+               inp->input_waiting &= ~DLIL_INPUT_WAITING;
 
                /*
-                * Take a reference on the workloop (current) thread; during
-                * detach, we will need to refer to it in order ot tear down
-                * its affinity.
+                * Protocol registration and injection must always use
+                * the main input thread; in theory the latter can utilize
+                * the corresponding input thread where the packet arrived
+                * on, but that requires our knowing the interface in advance
+                * (and the benefits might not worth the trouble.)
                 */
-               thread_reference(tp);
-               lck_mtx_lock_spin(&inp->input_lck);
-       }
-
-        /* WARNING
-        * Because of loopbacked multicast we cannot stuff the ifp in
-        * the rcvif of the packet header: loopback has its own dlil
-        * input queue
-        */
+               VERIFY(!(inp->input_waiting &
+                   (DLIL_PROTO_WAITING|DLIL_PROTO_REGISTER)));
 
-       if (inp == dlil_lo_thread_ptr && ifp->if_type == IFT_LOOP) {
-               if (dlil_lo_input_mbuf_head == NULL)
-                       dlil_lo_input_mbuf_head = m_head;
-               else if (dlil_lo_input_mbuf_tail != NULL)
-                       dlil_lo_input_mbuf_tail->m_nextpkt = m_head;
-               dlil_lo_input_mbuf_tail = m_tail;
-#if IFNET_INPUT_SANITY_CHK
-               if (dlil_input_sanity_check != 0) {
-                       dlil_lo_input_mbuf_count += pkt_count;
-                       inp->input_mbuf_cnt += pkt_count;
-                       inp->input_wake_cnt++;
+               if (inp->input_waiting & DLIL_INPUT_TERMINATE) {
+                       /* Free up pending packets */
+                       _flushq(&inp->rcvq_pkts);
+                       lck_mtx_unlock(&inp->input_lck);
 
-                       lck_mtx_assert(&inp->input_lck, LCK_MTX_ASSERT_OWNED);
+                       dlil_terminate_input_thread(inp);
+                       /* NOTREACHED */
+                       return;
                }
-#endif
-       } else {
-               if (inp->mbuf_head == NULL)
-                       inp->mbuf_head = m_head;
-               else if (inp->mbuf_tail != NULL)
-                       inp->mbuf_tail->m_nextpkt = m_head;
-               inp->mbuf_tail = m_tail;
-#if IFNET_INPUT_SANITY_CHK
-               if (dlil_input_sanity_check != 0) {
-                       inp->mbuf_count += pkt_count;
-                       inp->input_mbuf_cnt += pkt_count;
-                       inp->input_wake_cnt++;
 
-                       lck_mtx_assert(&inp->input_lck, LCK_MTX_ASSERT_OWNED);
-               }
-#endif
-       }
+               /* Total count of all packets */
+               m_cnt = qlen(&inp->rcvq_pkts);
+
+               /* Total bytes of all packets */
+               m_size = qsize(&inp->rcvq_pkts);
+
+               /* Packets for this interface */
+               m = _getq_all(&inp->rcvq_pkts);
+               VERIFY(m != NULL || m_cnt == 0);
+
+               nanouptime(&now);
+               if (!net_timerisset(&inp->sample_lasttime))
+                       *(&inp->sample_lasttime) = *(&now);
+
+               net_timersub(&now, &inp->sample_lasttime, &delta);
+               if (if_rxpoll && net_timerisset(&inp->sample_holdtime)) {
+                       u_int32_t ptot, btot;
+
+                       /* Accumulate statistics for current sampling */
+                       PKTCNTR_ADD(&inp->sstats, m_cnt, m_size);
+
+                       if (net_timercmp(&delta, &inp->sample_holdtime, <))
+                               goto skip;
+
+                       *(&inp->sample_lasttime) = *(&now);
+
+                       /* Calculate min/max of inbound bytes */
+                       btot = (u_int32_t)inp->sstats.bytes;
+                       if (inp->rxpoll_bmin == 0 || inp->rxpoll_bmin > btot)
+                               inp->rxpoll_bmin = btot;
+                       if (btot > inp->rxpoll_bmax)
+                               inp->rxpoll_bmax = btot;
+
+                       /* Calculate EWMA of inbound bytes */
+                       DLIL_EWMA(inp->rxpoll_bavg, btot, if_rxpoll_decay);
+
+                       /* Calculate min/max of inbound packets */
+                       ptot = (u_int32_t)inp->sstats.packets;
+                       if (inp->rxpoll_pmin == 0 || inp->rxpoll_pmin > ptot)
+                               inp->rxpoll_pmin = ptot;
+                       if (ptot > inp->rxpoll_pmax)
+                               inp->rxpoll_pmax = ptot;
+
+                       /* Calculate EWMA of inbound packets */
+                       DLIL_EWMA(inp->rxpoll_pavg, ptot, if_rxpoll_decay);
+
+                       /* Reset sampling statistics */
+                       PKTCNTR_CLEAR(&inp->sstats);
+
+                       /* Calculate EWMA of wakeup requests */
+                       DLIL_EWMA(inp->rxpoll_wavg, inp->wtot, if_rxpoll_decay);
+                       inp->wtot = 0;
+
+                       if (dlil_verbose) {
+                               if (!net_timerisset(&inp->dbg_lasttime))
+                                       *(&inp->dbg_lasttime) = *(&now);
+                               net_timersub(&now, &inp->dbg_lasttime, &delta);
+                               if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
+                                       *(&inp->dbg_lasttime) = *(&now);
+                                       printf("%s%d: [%s] pkts avg %d max %d "
+                                           "limits [%d/%d], wreq avg %d "
+                                           "limits [%d/%d], bytes avg %d "
+                                           "limits [%d/%d]\n", ifp->if_name,
+                                           ifp->if_unit, (inp->mode ==
+                                           IFNET_MODEL_INPUT_POLL_ON) ?
+                                           "ON" : "OFF", inp->rxpoll_pavg,
+                                           inp->rxpoll_pmax,
+                                           inp->rxpoll_plowat,
+                                           inp->rxpoll_phiwat,
+                                           inp->rxpoll_wavg,
+                                           inp->rxpoll_wlowat,
+                                           inp->rxpoll_whiwat,
+                                           inp->rxpoll_bavg,
+                                           inp->rxpoll_blowat,
+                                           inp->rxpoll_bhiwat);
+                               }
+                       }
 
-       inp->input_waiting |= DLIL_INPUT_WAITING;
-       if ((inp->input_waiting & DLIL_INPUT_RUNNING) == 0) {
-               wakeup((caddr_t)&inp->input_waiting);
-       }
-       lck_mtx_unlock(&inp->input_lck);
+                       /* Perform mode transition, if necessary */
+                       if (!net_timerisset(&inp->mode_lasttime))
+                               *(&inp->mode_lasttime) = *(&now);
+
+                       net_timersub(&now, &inp->mode_lasttime, &delta);
+                       if (net_timercmp(&delta, &inp->mode_holdtime, <))
+                               goto skip;
+
+                       if (inp->rxpoll_pavg <= inp->rxpoll_plowat &&
+                           inp->rxpoll_bavg <= inp->rxpoll_blowat &&
+                           inp->rxpoll_wavg <= inp->rxpoll_wlowat &&
+                           inp->mode != IFNET_MODEL_INPUT_POLL_OFF) {
+                               mode = IFNET_MODEL_INPUT_POLL_OFF;
+                       } else if (inp->rxpoll_pavg >= inp->rxpoll_phiwat &&
+                           (inp->rxpoll_bavg >= inp->rxpoll_bhiwat ||
+                           inp->rxpoll_wavg >= inp->rxpoll_whiwat) &&
+                           inp->mode != IFNET_MODEL_INPUT_POLL_ON) {
+                               mode = IFNET_MODEL_INPUT_POLL_ON;
+                       }
 
-       if (stats) {
-               atomic_add_64(&ifp->if_data.ifi_ipackets, stats->packets_in);
-               atomic_add_64(&ifp->if_data.ifi_ibytes, stats->bytes_in);
-               atomic_add_64(&ifp->if_data.ifi_ierrors, stats->errors_in);
+                       if (mode != inp->mode) {
+                               inp->mode = mode;
+                               *(&inp->mode_lasttime) = *(&now);
+                               poll_req++;
+                       }
+               }
+skip:
+               dlil_input_stats_sync(ifp, inp);
 
-               atomic_add_64(&ifp->if_data.ifi_opackets, stats->packets_out);
-               atomic_add_64(&ifp->if_data.ifi_obytes, stats->bytes_out);
-               atomic_add_64(&ifp->if_data.ifi_oerrors, stats->errors_out);
+               lck_mtx_unlock(&inp->input_lck);
 
-               atomic_add_64(&ifp->if_data.ifi_collisions, stats->collisions);
-               atomic_add_64(&ifp->if_data.ifi_iqdrops, stats->dropped);
-       }
+               /*
+                * If there's a mode change and interface is still attached,
+                * perform a downcall to the driver for the new mode.  Also
+                * hold an IO refcnt on the interface to prevent it from
+                * being detached (will be release below.)
+                */
+               if (poll_req != 0 && ifnet_is_attached(ifp, 1)) {
+                       struct ifnet_model_params p = { mode, { 0 } };
+                       errno_t err;
+
+                       if (dlil_verbose) {
+                               printf("%s%d: polling is now %s, "
+                                   "pkts avg %d max %d limits [%d/%d], "
+                                   "wreq avg %d limits [%d/%d], "
+                                   "bytes avg %d limits [%d/%d]\n",
+                                   ifp->if_name, ifp->if_unit,
+                                   (mode == IFNET_MODEL_INPUT_POLL_ON) ?
+                                   "ON" : "OFF", inp->rxpoll_pavg,
+                                   inp->rxpoll_pmax, inp->rxpoll_plowat,
+                                   inp->rxpoll_phiwat, inp->rxpoll_wavg,
+                                   inp->rxpoll_wlowat, inp->rxpoll_whiwat,
+                                   inp->rxpoll_bavg, inp->rxpoll_blowat,
+                                   inp->rxpoll_bhiwat);
+                       }
 
-       return (0);
-}
+                       if ((err = ((*ifp->if_input_ctl)(ifp,
+                           IFNET_CTL_SET_INPUT_MODEL, sizeof (p), &p))) != 0) {
+                               printf("%s%d: error setting polling mode "
+                                   "to %s (%d)\n", ifp->if_name, ifp->if_unit,
+                                   (mode == IFNET_MODEL_INPUT_POLL_ON) ?
+                                   "ON" : "OFF", err);
+                       }
 
-static int
-dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p,
-    char **frame_header_p, protocol_family_t protocol_family)
-{
-       struct ifnet_filter *filter;
+                       switch (mode) {
+                       case IFNET_MODEL_INPUT_POLL_OFF:
+                               ifnet_set_poll_cycle(ifp, NULL);
+                               inp->rxpoll_offreq++;
+                               if (err != 0)
+                                       inp->rxpoll_offerr++;
+                               break;
 
-       /*
-        * Pass the inbound packet to the interface filters
+                       case IFNET_MODEL_INPUT_POLL_ON:
+                               net_nsectimer(&if_rxpoll_interval_time, &ts);
+                               ifnet_set_poll_cycle(ifp, &ts);
+                               ifnet_poll(ifp);
+                               inp->rxpoll_onreq++;
+                               if (err != 0)
+                                       inp->rxpoll_onerr++;
+                               break;
+
+                       default:
+                               VERIFY(0);
+                               /* NOTREACHED */
+                       }
+
+                       /* Release the IO refcnt */
+                       ifnet_decr_iorefcnt(ifp);
+               }
+
+               /*
+               * NOTE warning %%% attention !!!!
+               * We should think about putting some thread starvation
+               * safeguards if we deal with long chains of packets.
+               */
+               if (m != NULL)
+                       dlil_input_packet_list_extended(NULL, m, m_cnt, mode);
+       }
+
+       /* NOTREACHED */
+       VERIFY(0);      /* we should never get here */
+}
+
+static void
+dlil_rxpoll_calc_limits(struct dlil_threading_info *inp)
+{
+       struct ifnet *ifp = inp->ifp;
+       u_int64_t sample_holdtime, inbw;
+
+       VERIFY(inp != dlil_main_input_thread);
+       VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL));
+
+       if ((inbw = ifnet_input_linkrate(ifp)) == 0) {
+               sample_holdtime = 0;    /* polling is disabled */
+               inp->rxpoll_wlowat = inp->rxpoll_plowat =
+                   inp->rxpoll_blowat = 0;
+               inp->rxpoll_whiwat = inp->rxpoll_phiwat =
+                   inp->rxpoll_bhiwat = (u_int32_t)-1;
+       } else {
+               unsigned int n, i;
+
+               n = 0;
+               for (i = 0; rxpoll_tbl[i].speed != 0; i++) {
+                       if (inbw < rxpoll_tbl[i].speed)
+                               break;
+                       n = i;
+               }
+               sample_holdtime = if_rxpoll_sample_holdtime;
+               inp->rxpoll_wlowat = if_rxpoll_wlowat;
+               inp->rxpoll_whiwat = if_rxpoll_whiwat;
+               inp->rxpoll_plowat = rxpoll_tbl[n].plowat;
+               inp->rxpoll_phiwat = rxpoll_tbl[n].phiwat;
+               inp->rxpoll_blowat = rxpoll_tbl[n].blowat;
+               inp->rxpoll_bhiwat = rxpoll_tbl[n].bhiwat;
+       }
+
+       net_nsectimer(&if_rxpoll_mode_holdtime, &inp->mode_holdtime);
+       net_nsectimer(&sample_holdtime, &inp->sample_holdtime);
+
+       if (dlil_verbose) {
+               printf("%s%d: speed %llu bps, sample per %llu nsec, "
+                   "pkt limits [%d/%d], wreq limits [%d/%d], "
+                   "bytes limits [%d/%d]\n", ifp->if_name, ifp->if_unit,
+                   inbw, sample_holdtime, inp->rxpoll_plowat,
+                   inp->rxpoll_phiwat, inp->rxpoll_wlowat, inp->rxpoll_whiwat,
+                   inp->rxpoll_blowat, inp->rxpoll_bhiwat);
+       }
+}
+
+errno_t
+ifnet_input(struct ifnet *ifp, struct mbuf *m_head,
+    const struct ifnet_stat_increment_param *s)
+{
+       return (ifnet_input_common(ifp, m_head, NULL, s, FALSE, FALSE));
+}
+
+errno_t
+ifnet_input_extended(struct ifnet *ifp, struct mbuf *m_head,
+    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
+{
+       return (ifnet_input_common(ifp, m_head, m_tail, s, TRUE, FALSE));
+}
+
+static errno_t
+ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
+    const struct ifnet_stat_increment_param *s, boolean_t ext, boolean_t poll)
+{
+       struct thread *tp = current_thread();
+       struct mbuf *last;
+       struct dlil_threading_info *inp;
+       u_int32_t m_cnt = 0, m_size = 0;
+
+       /*
+        * Drop the packet(s) if the parameters are invalid, or if the
+        * interface is no longer attached; else hold an IO refcnt to
+        * prevent it from being detached (will be released below.)
+        */
+       if (ifp == NULL || m_head == NULL || (s == NULL && ext) ||
+           (ifp != lo_ifp && !ifnet_is_attached(ifp, 1))) {
+               if (m_head != NULL)
+                       mbuf_freem_list(m_head);
+               return (EINVAL);
+       }
+
+       VERIFY(m_tail == NULL || ext);
+       VERIFY(s != NULL || !ext);
+
+       if (m_tail == NULL) {
+               last = m_head;
+               while (1) {
+#if IFNET_INPUT_SANITY_CHK
+                       if (dlil_input_sanity_check != 0)
+                               DLIL_INPUT_CHECK(last, ifp);
+#endif /* IFNET_INPUT_SANITY_CHK */
+                       m_cnt++;
+                       m_size += m_length(last);
+                       if (mbuf_nextpkt(last) == NULL)
+                               break;
+                       last = mbuf_nextpkt(last);
+               }
+               m_tail = last;
+       } else {
+#if IFNET_INPUT_SANITY_CHK
+               if (dlil_input_sanity_check != 0) {
+                       last = m_head;
+                       while (1) {
+                               DLIL_INPUT_CHECK(last, ifp);
+                               m_cnt++;
+                               m_size += m_length(last);
+                               if (mbuf_nextpkt(last) == NULL)
+                                       break;
+                               last = mbuf_nextpkt(last);
+                       }
+               } else {
+                       m_cnt = s->packets_in;
+                       m_size = s->bytes_in;
+                       last = m_tail;
+               }
+#else
+               m_cnt = s->packets_in;
+               m_size = s->bytes_in;
+               last = m_tail;
+#endif /* IFNET_INPUT_SANITY_CHK */
+       }
+
+       if (last != m_tail) {
+               panic_plain("%s: invalid input packet chain for %s%d, "
+                   "tail mbuf %p instead of %p\n", __func__, ifp->if_name,
+                   ifp->if_unit, m_tail, last);
+       }
+
+       /*
+        * Assert packet count only for the extended variant, for backwards
+        * compatibility, since this came directly from the device driver.
+        * Relax this assertion for input bytes, as the driver may have
+        * included the link-layer headers in the computation; hence
+        * m_size is just an approximation.
+        */
+       if (ext && s->packets_in != m_cnt) {
+               panic_plain("%s: input packet count mismatch for %s%d, "
+                   "%d instead of %d\n", __func__, ifp->if_name,
+                   ifp->if_unit, s->packets_in, m_cnt);
+       }
+
+       if ((inp = ifp->if_inp) == NULL)
+               inp = dlil_main_input_thread;
+
+       /*
+        * If there is a matching DLIL input thread associated with an
+        * affinity set, associate this thread with the same set.  We
+        * will only do this once.
+        */
+       lck_mtx_lock_spin(&inp->input_lck);
+       if (inp != dlil_main_input_thread && inp->net_affinity &&
+           ((!poll && inp->wloop_thr == THREAD_NULL) ||
+           (poll && inp->poll_thr == THREAD_NULL))) {
+               u_int32_t tag = inp->tag;
+
+               if (poll) {
+                       VERIFY(inp->poll_thr == THREAD_NULL);
+                       inp->poll_thr = tp;
+               } else {
+                       VERIFY(inp->wloop_thr == THREAD_NULL);
+                       inp->wloop_thr = tp;
+               }
+               lck_mtx_unlock(&inp->input_lck);
+
+               /* Associate the current thread with the new affinity tag */
+               (void) dlil_affinity_set(tp, tag);
+
+               /*
+                * Take a reference on the current thread; during detach,
+                * we will need to refer to it in order ot tear down its
+                * affinity.
+                */
+               thread_reference(tp);
+               lck_mtx_lock_spin(&inp->input_lck);
+       }
+
+        /*
+        * Because of loopbacked multicast we cannot stuff the ifp in
+        * the rcvif of the packet header: loopback (lo0) packets use a
+        * dedicated list so that we can later associate them with lo_ifp
+        * on their way up the stack.  Packets for other interfaces without
+        * dedicated input threads go to the regular list.
+        */
+       if (inp == dlil_main_input_thread && ifp == lo_ifp) {
+               struct dlil_main_threading_info *inpm =
+                   (struct dlil_main_threading_info *)inp;
+               _addq_multi(&inpm->lo_rcvq_pkts, m_head, m_tail, m_cnt, m_size);
+       } else {
+               _addq_multi(&inp->rcvq_pkts, m_head, m_tail, m_cnt, m_size);
+       }
+
+#if IFNET_INPUT_SANITY_CHK
+       if (dlil_input_sanity_check != 0) {
+               u_int32_t count;
+               struct mbuf *m0;
+
+               for (m0 = m_head, count = 0; m0; m0 = mbuf_nextpkt(m0))
+                       count++;
+
+               if (count != m_cnt) {
+                       panic_plain("%s%d: invalid packet count %d "
+                           "(expected %d)\n", ifp->if_name, ifp->if_unit,
+                           count, m_cnt);
+                       /* NOTREACHED */
+               }
+
+               inp->input_mbuf_cnt += m_cnt;
+       }
+#endif /* IFNET_INPUT_SANITY_CHK */
+
+       if (s != NULL) {
+               dlil_input_stats_add(s, inp, poll);
+               /*
+                * If we're using the main input thread, synchronize the
+                * stats now since we have the interface context.  All
+                * other cases involving dedicated input threads will
+                * have their stats synchronized there.
+                */
+               if (inp == dlil_main_input_thread)
+                       dlil_input_stats_sync(ifp, inp);
+       }
+
+       inp->input_waiting |= DLIL_INPUT_WAITING;
+       if (!(inp->input_waiting & DLIL_INPUT_RUNNING)) {
+               inp->wtot++;
+               wakeup_one((caddr_t)&inp->input_waiting);
+       }
+       lck_mtx_unlock(&inp->input_lck);
+
+       if (ifp != lo_ifp) {
+               /* Release the IO refcnt */
+               ifnet_decr_iorefcnt(ifp);
+       }
+
+       return (0);
+}
+
+void
+ifnet_start(struct ifnet *ifp)
+{
+       /*
+        * If the starter thread is inactive, signal it to do work.
+        */
+       lck_mtx_lock_spin(&ifp->if_start_lock);
+       ifp->if_start_req++;
+       if (!ifp->if_start_active && ifp->if_start_thread != THREAD_NULL) {
+               wakeup_one((caddr_t)&ifp->if_start_thread);
+       }
+       lck_mtx_unlock(&ifp->if_start_lock);
+}
+
+static void
+ifnet_start_thread_fn(void *v, wait_result_t w)
+{
+#pragma unused(w)
+       struct ifnet *ifp = v;
+       char ifname[IFNAMSIZ + 1];
+       struct timespec *ts = NULL;
+       struct ifclassq *ifq = &ifp->if_snd;
+
+       /*
+        * Treat the dedicated starter thread for lo0 as equivalent to
+        * the driver workloop thread; if net_affinity is enabled for
+        * the main input thread, associate this starter thread to it
+        * by binding them with the same affinity tag.  This is done
+        * only once (as we only have one lo_ifp which never goes away.)
+        */
+       if (ifp == lo_ifp) {
+               struct dlil_threading_info *inp = dlil_main_input_thread;
+               struct thread *tp = current_thread();
+
+               lck_mtx_lock(&inp->input_lck);
+               if (inp->net_affinity) {
+                       u_int32_t tag = inp->tag;
+
+                       VERIFY(inp->wloop_thr == THREAD_NULL);
+                       VERIFY(inp->poll_thr == THREAD_NULL);
+                       inp->wloop_thr = tp;
+                       lck_mtx_unlock(&inp->input_lck);
+
+                       /* Associate this thread with the affinity tag */
+                       (void) dlil_affinity_set(tp, tag);
+               } else {
+                       lck_mtx_unlock(&inp->input_lck);
+               }
+       }
+
+       snprintf(ifname, sizeof (ifname), "%s%d_starter",
+           ifp->if_name, ifp->if_unit);
+
+       lck_mtx_lock_spin(&ifp->if_start_lock);
+
+       for (;;) {
+               (void) msleep(&ifp->if_start_thread, &ifp->if_start_lock,
+                   (PZERO - 1) | PSPIN, ifname, ts);
+
+               /* interface is detached? */
+               if (ifp->if_start_thread == THREAD_NULL) {
+                       ifnet_set_start_cycle(ifp, NULL);
+                       lck_mtx_unlock(&ifp->if_start_lock);
+                       ifnet_purge(ifp);
+
+                       if (dlil_verbose) {
+                               printf("%s%d: starter thread terminated\n",
+                                   ifp->if_name, ifp->if_unit);
+                       }
+
+                       /* for the extra refcnt from kernel_thread_start() */
+                       thread_deallocate(current_thread());
+                       /* this is the end */
+                       thread_terminate(current_thread());
+                       /* NOTREACHED */
+                       return;
+               }
+
+               ifp->if_start_active = 1;
+               for (;;) {
+                       u_int32_t req = ifp->if_start_req;
+
+                       lck_mtx_unlock(&ifp->if_start_lock);
+                       /* invoke the driver's start routine */
+                       ((*ifp->if_start)(ifp));
+                       lck_mtx_lock_spin(&ifp->if_start_lock);
+
+                       /* if there's no pending request, we're done */
+                       if (req == ifp->if_start_req)
+                               break;
+               }
+               ifp->if_start_req = 0;
+               ifp->if_start_active = 0;
+               /*
+                * Wakeup N ns from now if rate-controlled by TBR, and if
+                * there are still packets in the send queue which haven't
+                * been dequeued so far; else sleep indefinitely (ts = NULL)
+                * until ifnet_start() is called again.
+                */
+               ts = ((IFCQ_TBR_IS_ENABLED(ifq) && !IFCQ_IS_EMPTY(ifq)) ?
+                   &ifp->if_start_cycle : NULL);
+
+               if (ts != NULL && ts->tv_sec == 0 && ts->tv_nsec == 0)
+                       ts = NULL;
+       }
+
+       /* NOTREACHED */
+       lck_mtx_unlock(&ifp->if_start_lock);
+       VERIFY(0);      /* we should never get here */
+}
+
+void
+ifnet_set_start_cycle(struct ifnet *ifp, struct timespec *ts)
+{
+       if (ts == NULL)
+               bzero(&ifp->if_start_cycle, sizeof (ifp->if_start_cycle));
+       else
+               *(&ifp->if_start_cycle) = *ts;
+
+       if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose)
+               printf("%s%d: restart interval set to %lu nsec\n",
+                   ifp->if_name, ifp->if_unit, ts->tv_nsec);
+}
+
+static void
+ifnet_poll(struct ifnet *ifp)
+{
+       /*
+        * If the poller thread is inactive, signal it to do work.
+        */
+       lck_mtx_lock_spin(&ifp->if_poll_lock);
+       ifp->if_poll_req++;
+       if (!ifp->if_poll_active && ifp->if_poll_thread != THREAD_NULL) {
+               wakeup_one((caddr_t)&ifp->if_poll_thread);
+       }
+       lck_mtx_unlock(&ifp->if_poll_lock);
+}
+
+static void
+ifnet_poll_thread_fn(void *v, wait_result_t w)
+{
+#pragma unused(w)
+       struct dlil_threading_info *inp;
+       struct ifnet *ifp = v;
+       char ifname[IFNAMSIZ + 1];
+       struct timespec *ts = NULL;
+       struct ifnet_stat_increment_param s;
+
+       snprintf(ifname, sizeof (ifname), "%s%d_poller",
+           ifp->if_name, ifp->if_unit);
+       bzero(&s, sizeof (s));
+
+       lck_mtx_lock_spin(&ifp->if_poll_lock);
+
+       inp = ifp->if_inp;
+       VERIFY(inp != NULL);
+
+       for (;;) {
+               if (ifp->if_poll_thread != THREAD_NULL) {
+                       (void) msleep(&ifp->if_poll_thread, &ifp->if_poll_lock,
+                           (PZERO - 1) | PSPIN, ifname, ts);
+               }
+
+               /* interface is detached (maybe while asleep)? */
+               if (ifp->if_poll_thread == THREAD_NULL) {
+                       ifnet_set_poll_cycle(ifp, NULL);
+                       lck_mtx_unlock(&ifp->if_poll_lock);
+
+                       if (dlil_verbose) {
+                               printf("%s%d: poller thread terminated\n",
+                                   ifp->if_name, ifp->if_unit);
+                       }
+
+                       /* for the extra refcnt from kernel_thread_start() */
+                       thread_deallocate(current_thread());
+                       /* this is the end */
+                       thread_terminate(current_thread());
+                       /* NOTREACHED */
+                       return;
+               }
+
+               ifp->if_poll_active = 1;
+               for (;;) {
+                       struct mbuf *m_head, *m_tail;
+                       u_int32_t m_lim, m_cnt, m_totlen;
+                       u_int16_t req = ifp->if_poll_req;
+
+                       lck_mtx_unlock(&ifp->if_poll_lock);
+
+                       /*
+                        * If no longer attached, there's nothing to do;
+                        * else hold an IO refcnt to prevent the interface
+                        * from being detached (will be released below.)
+                        */
+                       if (!ifnet_is_attached(ifp, 1))
+                               break;
+
+                       m_lim = (if_rxpoll_max != 0) ? if_rxpoll_max :
+                           MAX((qlimit(&inp->rcvq_pkts)),
+                           (inp->rxpoll_phiwat << 2));
+
+                       if (dlil_verbose > 1) {
+                               printf("%s%d: polling up to %d pkts, "
+                                   "pkts avg %d max %d, wreq avg %d, "
+                                   "bytes avg %d\n",
+                                   ifp->if_name, ifp->if_unit, m_lim,
+                                   inp->rxpoll_pavg, inp->rxpoll_pmax,
+                                   inp->rxpoll_wavg, inp->rxpoll_bavg);
+                       }
+
+                       /* invoke the driver's input poll routine */
+                       ((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail,
+                           &m_cnt, &m_totlen));
+
+                       if (m_head != NULL) {
+                               VERIFY(m_tail != NULL && m_cnt > 0);
+
+                               if (dlil_verbose > 1) {
+                                       printf("%s%d: polled %d pkts, "
+                                           "pkts avg %d max %d, wreq avg %d, "
+                                           "bytes avg %d\n",
+                                           ifp->if_name, ifp->if_unit, m_cnt,
+                                           inp->rxpoll_pavg, inp->rxpoll_pmax,
+                                           inp->rxpoll_wavg, inp->rxpoll_bavg);
+                               }
+
+                               /* stats are required for extended variant */
+                               s.packets_in = m_cnt;
+                               s.bytes_in = m_totlen;
+
+                               (void) ifnet_input_common(ifp, m_head, m_tail,
+                                   &s, TRUE, TRUE);
+                       } else if (dlil_verbose > 1) {
+                               printf("%s%d: no packets, pkts avg %d max %d, "
+                                   "wreq avg %d, bytes avg %d\n", ifp->if_name,
+                                   ifp->if_unit, inp->rxpoll_pavg,
+                                   inp->rxpoll_pmax, inp->rxpoll_wavg,
+                                   inp->rxpoll_bavg);
+                       }
+
+                       /* Release the io ref count */
+                       ifnet_decr_iorefcnt(ifp);
+
+                       lck_mtx_lock_spin(&ifp->if_poll_lock);
+
+                       /* if there's no pending request, we're done */
+                       if (req == ifp->if_poll_req)
+                               break;
+               }
+               ifp->if_poll_req = 0;
+               ifp->if_poll_active = 0;
+
+               /*
+                * Wakeup N ns from now, else sleep indefinitely (ts = NULL)
+                * until ifnet_poll() is called again.
+                */
+               ts = &ifp->if_poll_cycle;
+               if (ts->tv_sec == 0 && ts->tv_nsec == 0)
+                       ts = NULL;
+       }
+
+       /* NOTREACHED */
+       lck_mtx_unlock(&ifp->if_poll_lock);
+       VERIFY(0);      /* we should never get here */
+}
+
+void
+ifnet_set_poll_cycle(struct ifnet *ifp, struct timespec *ts)
+{
+       if (ts == NULL)
+               bzero(&ifp->if_poll_cycle, sizeof (ifp->if_poll_cycle));
+       else
+               *(&ifp->if_poll_cycle) = *ts;
+
+       if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose)
+               printf("%s%d: poll interval set to %lu nsec\n",
+                   ifp->if_name, ifp->if_unit, ts->tv_nsec);
+}
+
+void
+ifnet_purge(struct ifnet *ifp)
+{
+       if (ifp != NULL && (ifp->if_eflags & IFEF_TXSTART))
+               if_qflush(ifp, 0);
+}
+
+void
+ifnet_update_sndq(struct ifclassq *ifq, cqev_t ev)
+{
+       IFCQ_LOCK_ASSERT_HELD(ifq);
+
+       if (!(IFCQ_IS_READY(ifq)))
+               return;
+
+       if (IFCQ_TBR_IS_ENABLED(ifq)) {
+               struct tb_profile tb = { ifq->ifcq_tbr.tbr_rate_raw,
+                   ifq->ifcq_tbr.tbr_percent, 0 };
+               (void) ifclassq_tbr_set(ifq, &tb, FALSE);
+       }
+
+       ifclassq_update(ifq, ev);
+}
+
+void
+ifnet_update_rcv(struct ifnet *ifp, cqev_t ev)
+{
+       switch (ev) {
+       case CLASSQ_EV_LINK_SPEED:
+               if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL))
+                       ifp->if_poll_update++;
+               break;
+
+       default:
+               break;
+       }
+}
+
+errno_t
+ifnet_set_output_sched_model(struct ifnet *ifp, u_int32_t model)
+{
+       struct ifclassq *ifq;
+       u_int32_t omodel;
+       errno_t err;
+
+       if (ifp == NULL || (model != IFNET_SCHED_MODEL_DRIVER_MANAGED &&
+           model != IFNET_SCHED_MODEL_NORMAL))
+               return (EINVAL);
+       else if (!(ifp->if_eflags & IFEF_TXSTART))
+               return (ENXIO);
+
+       ifq = &ifp->if_snd;
+       IFCQ_LOCK(ifq);
+       omodel = ifp->if_output_sched_model;
+       ifp->if_output_sched_model = model;
+       if ((err = ifclassq_pktsched_setup(ifq)) != 0)
+               ifp->if_output_sched_model = omodel;
+       IFCQ_UNLOCK(ifq);
+
+       return (err);
+}
+
+errno_t
+ifnet_set_sndq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
+{
+       if (ifp == NULL)
+               return (EINVAL);
+       else if (!(ifp->if_eflags & IFEF_TXSTART))
+               return (ENXIO);
+
+       ifclassq_set_maxlen(&ifp->if_snd, maxqlen);
+
+       return (0);
+}
+
+errno_t
+ifnet_get_sndq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
+{
+       if (ifp == NULL || maxqlen == NULL)
+               return (EINVAL);
+       else if (!(ifp->if_eflags & IFEF_TXSTART))
+               return (ENXIO);
+
+       *maxqlen = ifclassq_get_maxlen(&ifp->if_snd);
+
+       return (0);
+}
+
+errno_t
+ifnet_get_sndq_len(struct ifnet *ifp, u_int32_t *qlen)
+{
+       if (ifp == NULL || qlen == NULL)
+               return (EINVAL);
+       else if (!(ifp->if_eflags & IFEF_TXSTART))
+               return (ENXIO);
+
+       *qlen = ifclassq_get_len(&ifp->if_snd);
+
+       return (0);
+}
+
+errno_t
+ifnet_set_rcvq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
+{
+       struct dlil_threading_info *inp;
+
+       if (ifp == NULL)
+               return (EINVAL);
+       else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL)
+               return (ENXIO);
+
+       if (maxqlen == 0)
+               maxqlen = if_rcvq_maxlen;
+       else if (maxqlen < IF_RCVQ_MINLEN)
+               maxqlen = IF_RCVQ_MINLEN;
+
+       inp = ifp->if_inp;
+       lck_mtx_lock(&inp->input_lck);
+       qlimit(&inp->rcvq_pkts) = maxqlen;
+       lck_mtx_unlock(&inp->input_lck);
+
+       return (0);
+}
+
+errno_t
+ifnet_get_rcvq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
+{
+       struct dlil_threading_info *inp;
+
+       if (ifp == NULL || maxqlen == NULL)
+               return (EINVAL);
+       else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL)
+               return (ENXIO);
+
+       inp = ifp->if_inp;
+       lck_mtx_lock(&inp->input_lck);
+       *maxqlen = qlimit(&inp->rcvq_pkts);
+       lck_mtx_unlock(&inp->input_lck);
+       return (0);
+}
+
+errno_t
+ifnet_enqueue(struct ifnet *ifp, struct mbuf *m)
+{
+       int error;
+
+       if (ifp == NULL || m == NULL || !(m->m_flags & M_PKTHDR) ||
+           m->m_nextpkt != NULL) {
+               if (m != NULL)
+                       m_freem_list(m);
+               return (EINVAL);
+       } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
+           !(ifp->if_refflags & IFRF_ATTACHED)) {
+               /* flag tested without lock for performance */
+               m_freem(m);
+               return (ENXIO);
+       } else if (!(ifp->if_flags & IFF_UP)) {
+               m_freem(m);
+               return (ENETDOWN);
+               
+       }
+
+       /* enqueue the packet */
+       error = ifclassq_enqueue(&ifp->if_snd, m);
+
+       /*
+        * Tell the driver to start dequeueing; do this even when the queue
+        * for the packet is suspended (EQSUSPENDED), as the driver could still
+        * be dequeueing from other unsuspended queues.
+        */
+       if (error == 0 || error == EQFULL || error == EQSUSPENDED)
+               ifnet_start(ifp);
+
+       return (error);
+}
+
+errno_t
+ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp)
+{
+       if (ifp == NULL || mp == NULL)
+               return (EINVAL);
+       else if (!(ifp->if_eflags & IFEF_TXSTART) ||
+           (ifp->if_output_sched_model != IFNET_SCHED_MODEL_NORMAL))
+               return (ENXIO);
+
+       return (ifclassq_dequeue(&ifp->if_snd, 1, mp, NULL, NULL, NULL));
+}
+
+errno_t
+ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc,
+    struct mbuf **mp)
+{
+       if (ifp == NULL || mp == NULL || !MBUF_VALID_SC(sc))
+               return (EINVAL);
+       else if (!(ifp->if_eflags & IFEF_TXSTART) ||
+           (ifp->if_output_sched_model != IFNET_SCHED_MODEL_DRIVER_MANAGED))
+               return (ENXIO);
+
+       return (ifclassq_dequeue_sc(&ifp->if_snd, sc, 1, mp, NULL, NULL, NULL));
+}
+
+errno_t
+ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t limit, struct mbuf **head,
+    struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
+{
+       if (ifp == NULL || head == NULL || limit < 1)
+               return (EINVAL);
+       else if (!(ifp->if_eflags & IFEF_TXSTART) ||
+           (ifp->if_output_sched_model != IFNET_SCHED_MODEL_NORMAL))
+               return (ENXIO);
+
+       return (ifclassq_dequeue(&ifp->if_snd, limit, head, tail, cnt, len));
+}
+
+errno_t
+ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc,
+    u_int32_t limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt,
+    u_int32_t *len)
+{
+
+       if (ifp == NULL || head == NULL || limit < 1 || !MBUF_VALID_SC(sc))
+               return (EINVAL);
+       else if (!(ifp->if_eflags & IFEF_TXSTART) ||
+           (ifp->if_output_sched_model != IFNET_SCHED_MODEL_DRIVER_MANAGED))
+               return (ENXIO);
+
+       return (ifclassq_dequeue_sc(&ifp->if_snd, sc, limit, head,
+           tail, cnt, len));
+}
+
+static int
+dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p,
+    char **frame_header_p, protocol_family_t protocol_family)
+{
+       struct ifnet_filter *filter;
+
+       /*
+        * Pass the inbound packet to the interface filters
         */
        lck_mtx_lock_spin(&ifp->if_flt_lock);
        /* prevent filter list from changing in case we drop the lock */
@@ -1458,8 +2720,110 @@ dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m)
        return;
 }
 
+static void
+dlil_input_stats_add(const struct ifnet_stat_increment_param *s,
+    struct dlil_threading_info *inp, boolean_t poll)
+{
+       struct ifnet_stat_increment_param *d = &inp->stats;
+
+       if (s->packets_in != 0)
+               d->packets_in += s->packets_in;
+       if (s->bytes_in != 0)
+               d->bytes_in += s->bytes_in;
+       if (s->errors_in != 0)
+               d->errors_in += s->errors_in;
+
+       if (s->packets_out != 0)
+               d->packets_out += s->packets_out;
+       if (s->bytes_out != 0)
+               d->bytes_out += s->bytes_out;
+       if (s->errors_out != 0)
+               d->errors_out += s->errors_out;
+
+       if (s->collisions != 0)
+               d->collisions += s->collisions;
+       if (s->dropped != 0)
+               d->dropped += s->dropped;
+
+       if (poll)
+               PKTCNTR_ADD(&inp->tstats, s->packets_in, s->bytes_in);
+}
+
+static void
+dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp)
+{
+       struct ifnet_stat_increment_param *s = &inp->stats;
+
+       /*
+        * Use of atomic operations is unavoidable here because
+        * these stats may also be incremented elsewhere via KPIs.
+        */
+       if (s->packets_in != 0) {
+               atomic_add_64(&ifp->if_data.ifi_ipackets, s->packets_in);
+               s->packets_in = 0;
+       }
+       if (s->bytes_in != 0) {
+               atomic_add_64(&ifp->if_data.ifi_ibytes, s->bytes_in);
+               s->bytes_in = 0;
+       }
+       if (s->errors_in != 0) {
+               atomic_add_64(&ifp->if_data.ifi_ierrors, s->errors_in);
+               s->errors_in = 0;
+       }
+
+       if (s->packets_out != 0) {
+               atomic_add_64(&ifp->if_data.ifi_opackets, s->packets_out);
+               s->packets_out = 0;
+       }
+       if (s->bytes_out != 0) {
+               atomic_add_64(&ifp->if_data.ifi_obytes, s->bytes_out);
+               s->bytes_out = 0;
+       }
+       if (s->errors_out != 0) {
+               atomic_add_64(&ifp->if_data.ifi_oerrors, s->errors_out);
+               s->errors_out = 0;
+       }
+
+       if (s->collisions != 0) {
+               atomic_add_64(&ifp->if_data.ifi_collisions, s->collisions);
+               s->collisions = 0;
+       }
+       if (s->dropped != 0) {
+               atomic_add_64(&ifp->if_data.ifi_iqdrops, s->dropped);
+               s->dropped = 0;
+       }
+
+       /*
+        * No need for atomic operations as they are modified here
+        * only from within the DLIL input thread context.
+        */
+       if (inp->tstats.packets != 0) {
+               inp->pstats.ifi_poll_packets += inp->tstats.packets;
+               inp->tstats.packets = 0;
+       }
+       if (inp->tstats.bytes != 0) {
+               inp->pstats.ifi_poll_bytes += inp->tstats.bytes;
+               inp->tstats.bytes = 0;
+       }
+}
+
+__private_extern__ void
+dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m)
+{
+       return (dlil_input_packet_list_common(ifp, m, 0,
+           IFNET_MODEL_INPUT_POLL_OFF, FALSE));
+}
+
 __private_extern__ void
-dlil_input_packet_list(struct ifnet * ifp_param, struct mbuf *m)
+dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m,
+    u_int32_t cnt, ifnet_model_t mode)
+{
+       return (dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE));
+}
+
+static void
+dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
+    u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
 {
        int                             error = 0;
        protocol_family_t               protocol_family;
@@ -1469,9 +2833,13 @@ dlil_input_packet_list(struct ifnet * ifp_param, struct mbuf *m)
        struct if_proto *               last_ifproto = NULL;
        mbuf_t                          pkt_first = NULL;
        mbuf_t *                        pkt_next = NULL;
+       u_int32_t                       poll_thresh = 0, poll_ival = 0;
 
        KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START,0,0,0,0,0);
 
+       if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 &&
+           (poll_ival = if_rxpoll_interval_pkts) > 0)
+               poll_thresh = cnt;
 
        while (m != NULL) {
                struct if_proto *ifproto = NULL;
@@ -1480,16 +2848,22 @@ dlil_input_packet_list(struct ifnet * ifp_param, struct mbuf *m)
                if (ifp_param == NULL)
                        ifp = m->m_pkthdr.rcvif;
 
+               if ((ifp->if_eflags & IFEF_RXPOLL) && poll_thresh != 0 &&
+                   poll_ival > 0 && (--poll_thresh % poll_ival) == 0)
+                       ifnet_poll(ifp);
+
                /* Check if this mbuf looks valid */
-               MBUF_INPUT_CHECK(m, ifp); 
+               MBUF_INPUT_CHECK(m, ifp);
 
                next_packet = m->m_nextpkt;
                m->m_nextpkt = NULL;
                frame_header = m->m_pkthdr.header;
                m->m_pkthdr.header = NULL;
 
-               /* Get an IO reference count if the interface is not 
-                * loopback and it is attached.
+               /*
+                * Get an IO reference count if the interface is not
+                * loopback (lo0) and it is attached; lo0 never goes
+                * away, so optimize for that.
                 */
                if (ifp != lo_ifp) {
                        if (!ifnet_is_attached(ifp, 1)) {
@@ -1499,22 +2873,7 @@ dlil_input_packet_list(struct ifnet * ifp_param, struct mbuf *m)
                        iorefcnt = 1;
                }
 
-               switch (m->m_pkthdr.prio) {
-                       case MBUF_TC_BK:
-                               atomic_add_64(&ifp->if_tc.ifi_ibkpackets, 1);
-                               atomic_add_64(&ifp->if_tc.ifi_ibkbytes, m->m_pkthdr.len);
-                               break;
-                       case MBUF_TC_VI:
-                               atomic_add_64(&ifp->if_tc.ifi_ivipackets, 1);
-                               atomic_add_64(&ifp->if_tc.ifi_ivibytes, m->m_pkthdr.len);
-                               break;
-                       case MBUF_TC_VO:
-                               atomic_add_64(&ifp->if_tc.ifi_ivopackets, 1);
-                               atomic_add_64(&ifp->if_tc.ifi_ivobytes, m->m_pkthdr.len);
-                               break;
-                       default:
-                               break;
-               }
+               ifp_inc_traffic_class_in(ifp, m);
 
                /* find which protocol family this packet is for */
                ifnet_lock_shared(ifp);
@@ -1527,6 +2886,10 @@ dlil_input_packet_list(struct ifnet * ifp_param, struct mbuf *m)
                        protocol_family = 0;
                }
 
+#if CONFIG_EMBEDDED
+               iptap_ipf_input(ifp, protocol_family, m, frame_header);
+#endif /* CONFIG_EMBEDDED */
+
                if (m->m_flags & (M_BCAST|M_MCAST))
                        atomic_add_64(&ifp->if_imcasts, 1);
 
@@ -1595,7 +2958,7 @@ next:
                        if_proto_free(ifproto);
                        ifproto = NULL;
                }
-               
+
                m = next_packet;
 
                /* update the driver's multicast filter, if needed */
@@ -1606,7 +2969,6 @@ next:
        }
 
        KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END,0,0,0,0,0);
-       return;
 }
 
 errno_t
@@ -1760,27 +3122,78 @@ dlil_get_socket_type(struct mbuf **mp, int family, int raw)
 }
 #endif
 
-static void
-if_inc_traffic_class_out(ifnet_t ifp, mbuf_t m)
+/*
+ * This is mostly called from the context of the DLIL input thread;
+ * because of that there is no need for atomic operations.
+ */
+static __inline void
+ifp_inc_traffic_class_in(struct ifnet *ifp, struct mbuf *m)
 {
        if (!(m->m_flags & M_PKTHDR))
                return;
 
-       switch (m->m_pkthdr.prio) {
-               case MBUF_TC_BK:
-                       atomic_add_64(&ifp->if_tc.ifi_obkpackets, 1);
-                       atomic_add_64(&ifp->if_tc.ifi_obkbytes, m->m_pkthdr.len);
-                       break;
-               case MBUF_TC_VI:
-                       atomic_add_64(&ifp->if_tc.ifi_ovipackets, 1);
-                       atomic_add_64(&ifp->if_tc.ifi_ovibytes, m->m_pkthdr.len);
-                       break;
-               case MBUF_TC_VO:
-                       atomic_add_64(&ifp->if_tc.ifi_ovopackets, 1);
-                       atomic_add_64(&ifp->if_tc.ifi_ovobytes, m->m_pkthdr.len);
-                       break;
-               default:
-                       break;
+       switch (m_get_traffic_class(m)) {
+       case MBUF_TC_BE:
+               ifp->if_tc.ifi_ibepackets++;
+               ifp->if_tc.ifi_ibebytes += m->m_pkthdr.len;
+               break;
+       case MBUF_TC_BK:
+               ifp->if_tc.ifi_ibkpackets++;
+               ifp->if_tc.ifi_ibkbytes += m->m_pkthdr.len;
+               break;
+       case MBUF_TC_VI:
+               ifp->if_tc.ifi_ivipackets++;
+               ifp->if_tc.ifi_ivibytes += m->m_pkthdr.len;
+               break;
+       case MBUF_TC_VO:
+               ifp->if_tc.ifi_ivopackets++;
+               ifp->if_tc.ifi_ivobytes += m->m_pkthdr.len;
+               break;
+       default:
+               break;
+       }
+
+       if (mbuf_is_traffic_class_privileged(m)) {
+               ifp->if_tc.ifi_ipvpackets++;
+               ifp->if_tc.ifi_ipvbytes += m->m_pkthdr.len;
+       }
+}
+
+/*
+ * This is called from DLIL output, hence multiple threads could end
+ * up modifying the statistics.  We trade off acccuracy for performance
+ * by not using atomic operations here.
+ */
+static __inline void
+ifp_inc_traffic_class_out(struct ifnet *ifp, struct mbuf *m)
+{
+       if (!(m->m_flags & M_PKTHDR))
+               return;
+
+       switch (m_get_traffic_class(m)) {
+       case MBUF_TC_BE:
+               ifp->if_tc.ifi_obepackets++;
+               ifp->if_tc.ifi_obebytes += m->m_pkthdr.len;
+               break;
+       case MBUF_TC_BK:
+               ifp->if_tc.ifi_obkpackets++;
+               ifp->if_tc.ifi_obkbytes += m->m_pkthdr.len;
+               break;
+       case MBUF_TC_VI:
+               ifp->if_tc.ifi_ovipackets++;
+               ifp->if_tc.ifi_ovibytes += m->m_pkthdr.len;
+               break;
+       case MBUF_TC_VO:
+               ifp->if_tc.ifi_ovopackets++;
+               ifp->if_tc.ifi_ovobytes += m->m_pkthdr.len;
+               break;
+       default:
+               break;
+       }
+
+       if (mbuf_is_traffic_class_privileged(m)) {
+               ifp->if_tc.ifi_opvpackets++;
+               ifp->if_tc.ifi_opvbytes += m->m_pkthdr.len;
        }
 }
 
@@ -1796,10 +3209,14 @@ if_inc_traffic_class_out(ifnet_t ifp, mbuf_t m)
  * an interface lock if we're going to take both. This makes sense
  * because a protocol is likely to interact with an ifp while it
  * is under the protocol lock.
+ *
+ * An advisory code will be returned if adv is not null. This
+ * can be used to provide feedback about interface queues to the 
+ * application.
  */
 errno_t
 dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist,
-    void *route, const struct sockaddr *dest, int raw)
+    void *route, const struct sockaddr *dest, int raw, struct flowadv *adv)
 {
        char *frame_type = NULL;
        char *dst_linkaddr = NULL;
@@ -1811,6 +3228,9 @@ dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist,
        mbuf_t  send_head = NULL;
        mbuf_t  *send_tail = &send_head;
        int iorefcnt = 0;
+#if CONFIG_EMBEDDED
+       u_int32_t pre = 0, post = 0;
+#endif /* CONFIG_EMBEDDED */
 
        KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START,0,0,0,0,0);
 
@@ -1877,13 +3297,13 @@ preout_again:
 
        do {
 #if CONFIG_DTRACE
-               if (proto_family == PF_INET) {
+               if (!raw && proto_family == PF_INET) {
                        struct ip *ip = mtod(m, struct ip*);
                        DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
                                struct ip *, ip, struct ifnet *, ifp,
                                struct ip *, ip, struct ip6_hdr *, NULL);
 
-               } else if (proto_family == PF_INET6) {
+               } else if (!raw && proto_family == PF_INET6) {
                        struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr*);
                        DTRACE_IP6(send, struct mbuf*, m, struct inpcb *, NULL,
                                struct ip6_hdr *, ip6, struct ifnet*, ifp,
@@ -1909,7 +3329,12 @@ preout_again:
                        }
 
                        retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr,
-                           frame_type);
+                           frame_type
+#if CONFIG_EMBEDDED
+                           ,
+                           &pre, &post
+#endif /* CONFIG_EMBEDDED */
+                           );
                        if (retval) {
                                if (retval != EJUSTRETURN)
                                        m_freem(m);
@@ -1987,10 +3412,22 @@ preout_again:
                        *send_tail = m;
                        send_tail = &m->m_nextpkt;
                } else {
-                       if_inc_traffic_class_out(ifp, m);
+#if CONFIG_EMBEDDED
+                       iptap_ipf_output(ifp, proto_family, (struct mbuf *)m,
+                           pre, post);
+#endif /* CONFIG_EMBEDDED */
+                       ifp_inc_traffic_class_out(ifp, m);
                        KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
                            0,0,0,0,0);
-                       retval = ifp->if_output(ifp, m);
+                       retval = (*ifp->if_output)(ifp, m);
+                       if (retval == EQFULL || retval == EQSUSPENDED) {
+                               if (adv != NULL && adv->code == FADV_SUCCESS) {
+                                       adv->code = (retval == EQFULL ?
+                                           FADV_FLOW_CONTROLLED :
+                                           FADV_SUSPENDED);
+                               }
+                               retval = 0;
+                       }
                        if (retval && dlil_verbose) {
                                printf("%s: output error on %s%d retval = %d\n",
                                    __func__, ifp->if_name, ifp->if_unit,
@@ -2010,10 +3447,21 @@ next:
        } while (m);
 
        if (send_head) {
-               if_inc_traffic_class_out(ifp, send_head);
+#if CONFIG_EMBEDDED
+               iptap_ipf_output(ifp, proto_family, (struct mbuf *)send_head,
+                   pre, post);
+#endif /* CONFIG_EMBEDDED */
+               ifp_inc_traffic_class_out(ifp, send_head);
 
                KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START, 0,0,0,0,0);
-               retval = ifp->if_output(ifp, send_head);
+               retval = (*ifp->if_output)(ifp, send_head);
+               if (retval == EQFULL || retval == EQSUSPENDED) {
+                       if (adv != NULL) {
+                               adv->code = (retval == EQFULL ?
+                                   FADV_FLOW_CONTROLLED : FADV_SUSPENDED);
+                       }
+                       retval = 0;
+               }
                if (retval && dlil_verbose) {
                        printf("%s: output error on %s%d retval = %d\n",
                            __func__, ifp->if_name, ifp->if_unit, retval);
@@ -2183,7 +3631,7 @@ dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr,
                    proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi);
                if (resolvep != NULL)
                        result = resolvep(ifp, proto_addr,
-                           (struct sockaddr_dl*)ll_addr, ll_len);
+                           (struct sockaddr_dl*)(void *)ll_addr, ll_len);
                if_proto_free(proto);
        }
 
@@ -2229,6 +3677,31 @@ dlil_send_arp_internal(ifnet_t ifp, u_short arpop,
        return (result);
 }
 
+__private_extern__ errno_t
+net_thread_check_lock(u_int32_t flag)
+{
+       struct uthread *uth = get_bsdthread_info(current_thread());
+       return ((uth->uu_network_lock_held & flag) == flag);
+}
+
+__private_extern__ void
+net_thread_set_lock(u_int32_t flag)
+{
+       struct uthread *uth = get_bsdthread_info(current_thread());
+
+       VERIFY((uth->uu_network_lock_held & flag) != flag);
+       uth->uu_network_lock_held |= flag;
+}
+
+__private_extern__ void
+net_thread_unset_lock(u_int32_t flag)
+{
+       struct uthread *uth = get_bsdthread_info(current_thread());
+
+       VERIFY((uth->uu_network_lock_held & flag) == flag);
+       uth->uu_network_lock_held &= (~flag);
+}
+
 static __inline__ int
 _is_announcement(const struct sockaddr_in * sender_sin,
     const struct sockaddr_in * target_sin)
@@ -2242,24 +3715,37 @@ _is_announcement(const struct sockaddr_in * sender_sin,
 __private_extern__ errno_t
 dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl* sender_hw,
     const struct sockaddr* sender_proto, const struct sockaddr_dl* target_hw,
-    const struct sockaddr* target_proto)
+    const struct sockaddr* target_proto0, u_int32_t rtflags)
 {
        errno_t result = 0;
        const struct sockaddr_in * sender_sin;
        const struct sockaddr_in * target_sin;
+       struct sockaddr_inarp target_proto_sinarp;
+       struct sockaddr *target_proto = (void *)(uintptr_t)target_proto0;
 
        if (target_proto == NULL || (sender_proto != NULL &&
            sender_proto->sa_family != target_proto->sa_family))
                return (EINVAL);
 
+       /*
+        * If the target is a (default) router, provide that
+        * information to the send_arp callback routine.
+        */
+       if (rtflags & RTF_ROUTER) {
+               bcopy(target_proto, &target_proto_sinarp,
+                   sizeof (struct sockaddr_in));
+               target_proto_sinarp.sin_other |= SIN_ROUTER;
+               target_proto = (struct sockaddr *)&target_proto_sinarp;
+       }
+
        /*
         * If this is an ARP request and the target IP is IPv4LL,
         * send the request on all interfaces.  The exception is
         * an announcement, which must only appear on the specific
         * interface.
         */
-       sender_sin = (const struct sockaddr_in *)sender_proto;
-       target_sin = (const struct sockaddr_in *)target_proto;
+       sender_sin = (struct sockaddr_in *)(void *)(uintptr_t)sender_proto;
+       target_sin = (struct sockaddr_in *)(void *)(uintptr_t)target_proto;
        if (target_proto->sa_family == AF_INET &&
            IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) &&
            ipv4_ll_arp_aware != 0 && arpop == ARPOP_REQUEST &&
@@ -2298,7 +3784,7 @@ dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl* sender_hw,
                                                /* Copy the source IP address */
                                                source_ip_copy =
                                                    *(struct sockaddr_in *)
-                                                   source_ip->ifa_addr;
+                                                   (void *)source_ip->ifa_addr;
                                                IFA_UNLOCK(source_ip);
                                                break;
                                        }
@@ -2316,8 +3802,8 @@ dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl* sender_hw,
 
                                /* Send the ARP */
                                new_result = dlil_send_arp_internal(cur_ifp,
-                                   arpop,
-                                   (struct sockaddr_dl *)source_hw->ifa_addr,
+                                   arpop, (struct sockaddr_dl *)(void *)
+                                   source_hw->ifa_addr,
                                    (struct sockaddr *)&source_ip_copy, NULL,
                                    target_proto);
 
@@ -2385,10 +3871,6 @@ ifnet_decr_iorefcnt(struct ifnet *ifp)
         */
        if (ifp->if_refio == 0 && 
                (ifp->if_refflags & IFRF_DETACHING) != 0) {
-               /* Convert the spinlock to a regular mutex if we have
-                * to wait for any reason while doing a wakeup.
-                */
-               lck_mtx_convert_spin(&ifp->if_ref_lock);
                wakeup(&(ifp->if_refio));
        }
        lck_mtx_unlock(&ifp->if_ref_lock);
@@ -2767,6 +4249,9 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
        struct ifaddr *ifa;
        struct if_data_internal if_data_saved;
        struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
+       struct dlil_threading_info *dl_inp;
+       u_int32_t sflags = 0;
+       int err;
 
        if (ifp == NULL)
                return (EINVAL);
@@ -2790,7 +4275,7 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
 
        lck_mtx_lock_spin(&ifp->if_ref_lock);
        if (ifp->if_refflags & IFRF_ATTACHED) {
-               panic("%s: flags mismatch (attached set) ifp=%p",
+               panic_plain("%s: flags mismatch (attached set) ifp=%p",
                    __func__, ifp);
                /* NOTREACHED */
        }
@@ -2856,9 +4341,6 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
        VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
        TAILQ_INIT(&ifp->if_addrhead);
 
-       if (ifp->if_snd.ifq_maxlen == 0)
-               ifp->if_snd.ifq_maxlen = ifqmaxlen;
-
        if (ifp->if_index == 0) {
                int idx = if_next_index();
 
@@ -2905,52 +4387,125 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
        /* Hold a reference to the underlying dlil_ifnet */
        ifnet_reference(ifp);
 
+       /* Clear stats (save and restore other fields that we care) */
+       if_data_saved = ifp->if_data;
+       bzero(&ifp->if_data, sizeof (ifp->if_data));
+       ifp->if_data.ifi_type = if_data_saved.ifi_type;
+       ifp->if_data.ifi_typelen = if_data_saved.ifi_typelen;
+       ifp->if_data.ifi_physical = if_data_saved.ifi_physical;
+       ifp->if_data.ifi_addrlen = if_data_saved.ifi_addrlen;
+       ifp->if_data.ifi_hdrlen = if_data_saved.ifi_hdrlen;
+       ifp->if_data.ifi_mtu = if_data_saved.ifi_mtu;
+       ifp->if_data.ifi_baudrate = if_data_saved.ifi_baudrate;
+       ifp->if_data.ifi_hwassist = if_data_saved.ifi_hwassist;
+       ifp->if_data.ifi_tso_v4_mtu = if_data_saved.ifi_tso_v4_mtu;
+       ifp->if_data.ifi_tso_v6_mtu = if_data_saved.ifi_tso_v6_mtu;
+       ifnet_touch_lastchange(ifp);
+
+       VERIFY(ifp->if_output_sched_model == IFNET_SCHED_MODEL_NORMAL ||
+           ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED);
+
+       /* By default, use SFB and enable flow advisory */
+       sflags = PKTSCHEDF_QALG_SFB;
+       if (if_flowadv)
+               sflags |= PKTSCHEDF_QALG_FLOWCTL;
+
+       /* Initialize transmit queue(s) */
+       err = ifclassq_setup(ifp, sflags, (dl_if->dl_if_flags & DLIF_REUSE));
+       if (err != 0) {
+               panic_plain("%s: ifp=%p couldn't initialize transmit queue; "
+                   "err=%d", __func__, ifp, err);
+               /* NOTREACHED */
+       }
+
+       /* Sanity checks on the input thread storage */
+       dl_inp = &dl_if->dl_if_inpstorage;
+       bzero(&dl_inp->stats, sizeof (dl_inp->stats));
+       VERIFY(dl_inp->input_waiting == 0);
+       VERIFY(dl_inp->wtot == 0);
+       VERIFY(dl_inp->ifp == NULL);
+       VERIFY(qhead(&dl_inp->rcvq_pkts) == NULL && qempty(&dl_inp->rcvq_pkts));
+       VERIFY(qlimit(&dl_inp->rcvq_pkts) == 0);
+       VERIFY(!dl_inp->net_affinity);
+       VERIFY(ifp->if_inp == NULL);
+       VERIFY(dl_inp->input_thr == THREAD_NULL);
+       VERIFY(dl_inp->wloop_thr == THREAD_NULL);
+       VERIFY(dl_inp->poll_thr == THREAD_NULL);
+       VERIFY(dl_inp->tag == 0);
+       VERIFY(dl_inp->mode == IFNET_MODEL_INPUT_POLL_OFF);
+       bzero(&dl_inp->tstats, sizeof (dl_inp->tstats));
+       bzero(&dl_inp->pstats, sizeof (dl_inp->pstats));
+       bzero(&dl_inp->sstats, sizeof (dl_inp->sstats));
+#if IFNET_INPUT_SANITY_CHK
+       VERIFY(dl_inp->input_mbuf_cnt == 0);
+#endif /* IFNET_INPUT_SANITY_CHK */
+
+       /*
+        * A specific DLIL input thread is created per Ethernet/cellular
+        * interface or for an interface which supports opportunistic
+        * input polling.  Pseudo interfaces or other types of interfaces
+        * use the main input thread instead.
+        */
+       if ((net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) ||
+           ifp->if_type == IFT_ETHER || ifp->if_type == IFT_CELLULAR) {
+               ifp->if_inp = dl_inp;
+               err = dlil_create_input_thread(ifp, ifp->if_inp);
+               if (err != 0) {
+                       panic_plain("%s: ifp=%p couldn't get an input thread; "
+                           "err=%d", __func__, ifp, err);
+                       /* NOTREACHED */
+               }
+       }
+
        /*
-        * A specific dlil input thread is created per Ethernet/cellular
-        * interface.  pseudo interfaces or other types of interfaces use
-        * the main ("loopback") thread.
-        *
-        * If the sysctl "net.link.generic.system.multi_threaded_input" is set
-        * to zero, all packets will be handled by the main loopback thread,
-        * reverting to 10.4.x behaviour.
+        * If the driver supports the new transmit model, create a workloop
+        * starter thread to invoke the if_start callback where the packets
+        * may be dequeued and transmitted.
         */
-       if (dlil_multithreaded_input &&
-           (ifp->if_type == IFT_ETHER || ifp->if_type == IFT_CELLULAR)) {
-               int err;
-
-               ifp->if_input_thread = zalloc(dlif_inp_zone);
-               if (ifp->if_input_thread == NULL) {
-                       panic("%s: ifp=%p couldn't alloc threading",
-                           __func__, ifp);
+       if (ifp->if_eflags & IFEF_TXSTART) {
+               VERIFY(ifp->if_start != NULL);
+               VERIFY(ifp->if_start_thread == THREAD_NULL);
+
+               ifnet_set_start_cycle(ifp, NULL);
+               ifp->if_start_active = 0;
+               ifp->if_start_req = 0;
+               if ((err = kernel_thread_start(ifnet_start_thread_fn, ifp,
+                   &ifp->if_start_thread)) != KERN_SUCCESS) {
+                       panic_plain("%s: ifp=%p couldn't get a start thread; "
+                           "err=%d", __func__, ifp, err);
                        /* NOTREACHED */
                }
-               bzero(ifp->if_input_thread, dlif_inp_size);
-               err = dlil_create_input_thread(ifp, ifp->if_input_thread);
-               if (err != 0) {
-                       panic("%s: ifp=%p couldn't get a thread. "
+               ml_thread_policy(ifp->if_start_thread, MACHINE_GROUP,
+                   (MACHINE_NETWORK_GROUP|MACHINE_NETWORK_WORKLOOP));
+       }
+
+       /*
+        * If the driver supports the new receive model, create a poller
+        * thread to invoke if_input_poll callback where the packets may
+        * be dequeued from the driver and processed for reception.
+        */
+       if (ifp->if_eflags & IFEF_RXPOLL) {
+               VERIFY(ifp->if_input_poll != NULL);
+               VERIFY(ifp->if_input_ctl != NULL);
+               VERIFY(ifp->if_poll_thread == THREAD_NULL);
+
+               ifnet_set_poll_cycle(ifp, NULL);
+               ifp->if_poll_update = 0;
+               ifp->if_poll_active = 0;
+               ifp->if_poll_req = 0;
+               if ((err = kernel_thread_start(ifnet_poll_thread_fn, ifp,
+                   &ifp->if_poll_thread)) != KERN_SUCCESS) {
+                       panic_plain("%s: ifp=%p couldn't get a poll thread; "
                            "err=%d", __func__, ifp, err);
                        /* NOTREACHED */
                }
-#ifdef DLIL_DEBUG
-               printf("%s: dlil thread for ifp=%p if_index=%d\n",
-                   __func__, ifp, ifp->if_index);
-#endif
+               ml_thread_policy(ifp->if_poll_thread, MACHINE_GROUP,
+                   (MACHINE_NETWORK_GROUP|MACHINE_NETWORK_WORKLOOP));
        }
 
-       /* Clear stats (save and restore other fields that we care) */
-       if_data_saved = ifp->if_data;
-       bzero(&ifp->if_data, sizeof (ifp->if_data));
-       ifp->if_data.ifi_type = if_data_saved.ifi_type;
-       ifp->if_data.ifi_typelen = if_data_saved.ifi_typelen;
-       ifp->if_data.ifi_physical = if_data_saved.ifi_physical;
-       ifp->if_data.ifi_addrlen = if_data_saved.ifi_addrlen;
-       ifp->if_data.ifi_hdrlen = if_data_saved.ifi_hdrlen;
-       ifp->if_data.ifi_mtu = if_data_saved.ifi_mtu;
-       ifp->if_data.ifi_baudrate = if_data_saved.ifi_baudrate;
-       ifp->if_data.ifi_hwassist = if_data_saved.ifi_hwassist;
-       ifp->if_data.ifi_tso_v4_mtu = if_data_saved.ifi_tso_v4_mtu;
-       ifp->if_data.ifi_tso_v6_mtu = if_data_saved.ifi_tso_v6_mtu;
-       ifnet_touch_lastchange(ifp);
+       VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
+       VERIFY(ifp->if_desc.ifd_len == 0);
+       VERIFY(ifp->if_desc.ifd_desc != NULL);
 
        /* Record attach PC stacktrace */
        ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_attach);
@@ -3024,6 +4579,9 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
         */
        lck_mtx_lock(rnh_lock);
        ifnet_lock_exclusive(ifp);
+       /* Initialize Link Quality Metric (loopback [lo0] is always good) */
+       ifp->if_lqm = (ifp == lo_ifp) ? IFNET_LQM_THRESH_GOOD :
+           IFNET_LQM_THRESH_UNKNOWN;
        lck_mtx_lock_spin(&ifp->if_ref_lock);
        ifp->if_refflags = IFRF_ATTACHED;
        lck_mtx_unlock(&ifp->if_ref_lock);
@@ -3113,7 +4671,8 @@ dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr)
                /* address and mask sockaddr_dl locations */
                asdl = (struct sockaddr_dl *)(ifa + 1);
                bzero(asdl, SOCK_MAXADDRLEN);
-               msdl = (struct sockaddr_dl *)((char *)asdl + SOCK_MAXADDRLEN);
+               msdl = (struct sockaddr_dl *)(void *)
+                   ((char *)asdl + SOCK_MAXADDRLEN);
                bzero(msdl, SOCK_MAXADDRLEN);
        } else {
                VERIFY(ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa);
@@ -3129,9 +4688,9 @@ dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr)
                }
                IFA_LOCK(ifa);
                /* address and mask sockaddr_dl locations */
-               asdl = (struct sockaddr_dl *)&dl_if->dl_if_lladdr.asdl;
+               asdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.asdl;
                bzero(asdl, sizeof (dl_if->dl_if_lladdr.asdl));
-               msdl = (struct sockaddr_dl *)&dl_if->dl_if_lladdr.msdl;
+               msdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.msdl;
                bzero(msdl, sizeof (dl_if->dl_if_lladdr.msdl));
        }
 
@@ -3188,8 +4747,8 @@ ifnet_detach(ifnet_t ifp)
        if (ifp == NULL)
                return (EINVAL);
 
-       ifnet_head_lock_exclusive();
        lck_mtx_lock(rnh_lock);
+       ifnet_head_lock_exclusive();
        ifnet_lock_exclusive(ifp);
 
        /*
@@ -3204,15 +4763,15 @@ ifnet_detach(ifnet_t ifp)
         if (!(ifp->if_refflags & IFRF_ATTACHED)) {
                lck_mtx_unlock(&ifp->if_ref_lock);
                ifnet_lock_done(ifp);
-               lck_mtx_unlock(rnh_lock);
                ifnet_head_done();
+               lck_mtx_unlock(rnh_lock);
                return (EINVAL);
        } else if (ifp->if_refflags & IFRF_DETACHING) {
                /* Interface has already been detached */
                lck_mtx_unlock(&ifp->if_ref_lock);
                ifnet_lock_done(ifp);
-               lck_mtx_unlock(rnh_lock);
                ifnet_head_done();
+               lck_mtx_unlock(rnh_lock);
                return (ENXIO);
        }
        /* Indicate this interface is being detached */
@@ -3237,8 +4796,20 @@ ifnet_detach(ifnet_t ifp)
        ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_detach);
 
        ifnet_lock_done(ifp);
-       lck_mtx_unlock(rnh_lock);
        ifnet_head_done();
+       lck_mtx_unlock(rnh_lock);
+
+       /* Reset Link Quality Metric (unless loopback [lo0]) */
+       if (ifp != lo_ifp)
+               if_lqm_update(ifp, IFNET_LQM_THRESH_OFF);
+
+       /* Reset TCP local statistics */
+       if (ifp->if_tcp_stat != NULL)
+               bzero(ifp->if_tcp_stat, sizeof(*ifp->if_tcp_stat));
+
+       /* Reset UDP local statistics */
+       if (ifp->if_udp_stat != NULL)
+               bzero(ifp->if_udp_stat, sizeof(*ifp->if_udp_stat));
 
        /* Let BPF know we're detaching */
        bpfdetach(ifp);
@@ -3246,6 +4817,9 @@ ifnet_detach(ifnet_t ifp)
        /* Mark the interface as DOWN */
        if_down(ifp);
 
+       /* Drain send queue */
+       ifclassq_teardown(ifp);
+
        /* Disable forwarding cached route */
        lck_mtx_lock(&ifp->if_cached_route_lock);
        ifp->if_fwd_cacheok = 0;
@@ -3303,26 +4877,48 @@ ifnet_detaching_dequeue(void)
        return (ifp);
 }
 
-static void
-ifnet_delayed_thread_func(void)
+static int
+ifnet_detacher_thread_cont(int err)
 {
+#pragma unused(err)
        struct ifnet *ifp;
 
        for (;;) {
-               dlil_if_lock();
+               dlil_if_lock_assert();
                while (ifnet_detaching_cnt == 0) {
-                       (void) msleep(&ifnet_delayed_run, &dlil_ifnet_lock,
-                           (PZERO - 1), "ifnet_delayed_thread", NULL);
+                       (void) msleep0(&ifnet_delayed_run, &dlil_ifnet_lock,
+                           (PZERO - 1), "ifnet_detacher_cont", 0,
+                           ifnet_detacher_thread_cont);
+                       /* NOTREACHED */
                }
 
                VERIFY(TAILQ_FIRST(&ifnet_detaching_head) != NULL);
 
                /* Take care of detaching ifnet */
                ifp = ifnet_detaching_dequeue();
-               dlil_if_unlock();
-               if (ifp != NULL)
+               if (ifp != NULL) {
+                       dlil_if_unlock();
                        ifnet_detach_final(ifp);
+                       dlil_if_lock();
+               }
        }
+       /* NOTREACHED */
+       return (0);
+}
+
+static void
+ifnet_detacher_thread_func(void *v, wait_result_t w)
+{
+#pragma unused(v, w)
+       dlil_if_lock();
+       (void) msleep0(&ifnet_delayed_run, &dlil_ifnet_lock,
+           (PZERO - 1), "ifnet_detacher", 0, ifnet_detacher_thread_cont);
+       /*
+        * msleep0() shouldn't have returned as PCATCH was not set;
+        * therefore assert in this case.
+        */
+       dlil_if_unlock();
+       VERIFY(0);
 }
 
 static void
@@ -3330,7 +4926,7 @@ ifnet_detach_final(struct ifnet *ifp)
 {
        struct ifnet_filter *filter, *filter_next;
        struct ifnet_filter_head fhead;
-       struct dlil_threading_info *inputthread;
+       struct dlil_threading_info *inp;
        struct ifaddr *ifa;
        ifnet_detached_func if_free;
        int i;
@@ -3342,8 +4938,10 @@ ifnet_detach_final(struct ifnet *ifp)
                /* NOTREACHED */
        }
 
-       /* Wait until the existing IO references get released
-        * before we proceed with ifnet_detach
+       /*
+        * Wait until the existing IO references get released
+        * before we proceed with ifnet_detach.  This is not a
+        * common case, so block without using a continuation.
         */
        while (ifp->if_refio > 0) {
                printf("%s: Waiting for IO references on %s%d interface "
@@ -3420,65 +5018,87 @@ ifnet_detach_final(struct ifnet *ifp)
        /* There should not be any addresses left */
        VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
 
+       /*
+        * Signal the starter thread to terminate itself.
+        */
+       if (ifp->if_start_thread != THREAD_NULL) {
+               lck_mtx_lock_spin(&ifp->if_start_lock);
+               ifp->if_start_thread = THREAD_NULL;
+               wakeup_one((caddr_t)&ifp->if_start_thread);
+               lck_mtx_unlock(&ifp->if_start_lock);
+       }
+
+       /*
+        * Signal the poller thread to terminate itself.
+        */
+       if (ifp->if_poll_thread != THREAD_NULL) {
+               lck_mtx_lock_spin(&ifp->if_poll_lock);
+               ifp->if_poll_thread = THREAD_NULL;
+               wakeup_one((caddr_t)&ifp->if_poll_thread);
+               lck_mtx_unlock(&ifp->if_poll_lock);
+       }
+
        /*
         * If thread affinity was set for the workloop thread, we will need
         * to tear down the affinity and release the extra reference count
-        * taken at attach time;
+        * taken at attach time.  Does not apply to lo0 or other interfaces
+        * without dedicated input threads.
         */
-       if ((inputthread = ifp->if_input_thread) != NULL) {
-               if (inputthread->net_affinity) {
-                       struct thread *tp;
-
-                       if (inputthread == dlil_lo_thread_ptr) {
-                               panic("%s: Thread affinity should not be "
-                                   "enabled on the loopback dlil input "
-                                   "thread", __func__);
-                               /* NOTREACHED */
+       if ((inp = ifp->if_inp) != NULL) {
+               VERIFY(inp != dlil_main_input_thread);
+
+               if (inp->net_affinity) {
+                       struct thread *tp, *wtp, *ptp;
+
+                       lck_mtx_lock_spin(&inp->input_lck);
+                       wtp = inp->wloop_thr;
+                       inp->wloop_thr = THREAD_NULL;
+                       ptp = inp->poll_thr;
+                       inp->poll_thr = THREAD_NULL;
+                       tp = inp->input_thr;    /* don't nullify now */
+                       inp->tag = 0;
+                       inp->net_affinity = FALSE;
+                       lck_mtx_unlock(&inp->input_lck);
+
+                       /* Tear down poll thread affinity */
+                       if (ptp != NULL) {
+                               VERIFY(ifp->if_eflags & IFEF_RXPOLL);
+                               (void) dlil_affinity_set(ptp,
+                                   THREAD_AFFINITY_TAG_NULL);
+                               thread_deallocate(ptp);
                        }
 
-                       lck_mtx_lock_spin(&inputthread->input_lck);
-                       tp = inputthread->workloop_thread;
-                       inputthread->workloop_thread = NULL;
-                       inputthread->tag = 0;
-                       inputthread->net_affinity = FALSE;
-                       lck_mtx_unlock(&inputthread->input_lck);
-
                        /* Tear down workloop thread affinity */
-                       if (tp != NULL) {
-                               (void) dlil_affinity_set(tp,
+                       if (wtp != NULL) {
+                               (void) dlil_affinity_set(wtp,
                                    THREAD_AFFINITY_TAG_NULL);
-                               thread_deallocate(tp);
+                               thread_deallocate(wtp);
                        }
 
-                       /* Tear down dlil input thread affinity */
-                       tp = inputthread->input_thread;
+                       /* Tear down DLIL input thread affinity */
                        (void) dlil_affinity_set(tp, THREAD_AFFINITY_TAG_NULL);
                        thread_deallocate(tp);
                }
 
-               /* cleanup ifp dlil input thread, if any */
-               ifp->if_input_thread = NULL;
-
-               if (inputthread != dlil_lo_thread_ptr) {
-#ifdef DLIL_DEBUG
-                       printf("%s: wakeup thread threadinfo: %p "
-                           "input_thread=%p threads: cur=%d max=%d\n",
-                           __func__, inputthread, inputthread->input_thread,
-                           dlil_multithreaded_input, cur_dlil_input_threads);
-#endif
-                       lck_mtx_lock_spin(&inputthread->input_lck);
-
-                       inputthread->input_waiting |= DLIL_INPUT_TERMINATE;
-                       if (!(inputthread->input_waiting & DLIL_INPUT_RUNNING))
-                               wakeup((caddr_t)&inputthread->input_waiting);
+               /* disassociate ifp DLIL input thread */
+               ifp->if_inp = NULL;
 
-                       lck_mtx_unlock(&inputthread->input_lck);
+               lck_mtx_lock_spin(&inp->input_lck);
+               inp->input_waiting |= DLIL_INPUT_TERMINATE;
+               if (!(inp->input_waiting & DLIL_INPUT_RUNNING)) {
+                       wakeup_one((caddr_t)&inp->input_waiting);
                }
+               lck_mtx_unlock(&inp->input_lck);
        }
 
        /* The driver might unload, so point these to ourselves */
        if_free = ifp->if_free;
        ifp->if_output = ifp_if_output;
+       ifp->if_pre_enqueue = ifp_if_output;
+       ifp->if_start = ifp_if_start;
+       ifp->if_output_ctl = ifp_if_ctl;
+       ifp->if_input_poll = ifp_if_input_poll;
+       ifp->if_input_ctl = ifp_if_ctl;
        ifp->if_ioctl = ifp_if_ioctl;
        ifp->if_set_bpf_tap = ifp_if_set_bpf_tap;
        ifp->if_free = ifp_if_free;
@@ -3489,6 +5109,12 @@ ifnet_detach_final(struct ifnet *ifp)
        ifp->if_del_proto = ifp_if_del_proto;
        ifp->if_check_multi = ifp_if_check_multi;
 
+       /* wipe out interface description */
+       VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
+       ifp->if_desc.ifd_len = 0;
+       VERIFY(ifp->if_desc.ifd_desc != NULL);
+       bzero(ifp->if_desc.ifd_desc, IF_DESCSIZE);
+
        ifnet_lock_done(ifp);
 
 #if PF
@@ -3505,6 +5131,9 @@ ifnet_detach_final(struct ifnet *ifp)
        VERIFY(ifp->if_flt_waiters == 0);
        lck_mtx_unlock(&ifp->if_flt_lock);
 
+       /* Last chance to drain send queue */
+       if_qflush(ifp, 0);
+
        /* Last chance to cleanup any cached route */
        lck_mtx_lock(&ifp->if_cached_route_lock);
        VERIFY(!ifp->if_fwd_cacheok);
@@ -3553,6 +5182,34 @@ ifp_if_output(struct ifnet *ifp, struct mbuf *m)
        return (0);
 }
 
+static void
+ifp_if_start(struct ifnet *ifp)
+{
+       ifnet_purge(ifp);
+}
+
+static void
+ifp_if_input_poll(struct ifnet *ifp, u_int32_t flags, u_int32_t max_cnt,
+    struct mbuf **m_head, struct mbuf **m_tail, u_int32_t *cnt, u_int32_t *len)
+{
+#pragma unused(ifp, flags, max_cnt)
+       if (m_head != NULL)
+               *m_head = NULL;
+       if (m_tail != NULL)
+               *m_tail = NULL;
+       if (cnt != NULL)
+               *cnt = 0;
+       if (len != NULL)
+               *len = 0;
+}
+
+static errno_t
+ifp_if_ctl(struct ifnet *ifp, ifnet_ctl_cmd_t cmd, u_int32_t arglen, void *arg)
+{
+#pragma unused(ifp, cmd, arglen, arg)
+       return (EOPNOTSUPP);
+}
+
 static errno_t
 ifp_if_demux(struct ifnet *ifp, struct mbuf *m, char *fh, protocol_family_t *pf)
 {
@@ -3583,17 +5240,25 @@ ifp_if_check_multi(struct ifnet *ifp, const struct sockaddr *sa)
        return (EOPNOTSUPP);
 }
 
-static errno_t
-ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
-    const struct sockaddr *sa, const char *ll, const char *t)
+static errno_t ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
+const struct sockaddr *sa, const char *ll, const char *t
+#if CONFIG_EMBEDDED
+               ,
+               u_int32_t *pre, u_int32_t *post
+#endif /* CONFIG_EMBEDDED */
+                                                        )
 {
 #pragma unused(ifp, m, sa, ll, t)
        m_freem(*m);
        *m = NULL;
+#if CONFIG_EMBEDDED
+       *pre = 0;
+       *post = 0;
+#endif /* CONFIG_EMBEDDED */
        return (EJUSTRETURN);
 }
 
-static errno_t
+errno_t
 ifp_if_ioctl(struct ifnet *ifp, unsigned long cmd, void *arg)
 {
 #pragma unused(ifp, cmd, arg)
@@ -3697,20 +5362,43 @@ int dlil_if_acquire(u_int32_t family, const void *uniqueid,
                dlifp1->dl_if_trace = dlil_if_trace;
        }
        ifp1->if_name = dlifp1->dl_if_namestorage;
+
+       /* initialize interface description */
+       ifp1->if_desc.ifd_maxlen = IF_DESCSIZE;
+       ifp1->if_desc.ifd_len = 0;
+       ifp1->if_desc.ifd_desc = dlifp1->dl_if_descstorage;
+
 #if CONFIG_MACF_NET
        mac_ifnet_label_init(ifp1);
 #endif
 
+       if ((ret = dlil_alloc_local_stats(ifp1)) != 0) {
+               DLIL_PRINTF("%s: failed to allocate if local stats, "
+                   "error: %d\n", __func__, ret);
+               /* This probably shouldn't be fatal */
+               ret = 0;
+       }
+
        lck_mtx_init(&dlifp1->dl_if_lock, ifnet_lock_group, ifnet_lock_attr);
        lck_rw_init(&ifp1->if_lock, ifnet_lock_group, ifnet_lock_attr);
        lck_mtx_init(&ifp1->if_ref_lock, ifnet_lock_group, ifnet_lock_attr);
        lck_mtx_init(&ifp1->if_flt_lock, ifnet_lock_group, ifnet_lock_attr);
-       lck_mtx_init(&ifp1->if_cached_route_lock, ifnet_lock_group,
-           ifnet_lock_attr);
        lck_mtx_init(&ifp1->if_addrconfig_lock, ifnet_lock_group,
            ifnet_lock_attr);
        lck_rw_init(&ifp1->if_llreach_lock, ifnet_lock_group, ifnet_lock_attr);
 
+       /* for send data paths */
+       lck_mtx_init(&ifp1->if_start_lock, ifnet_snd_lock_group,
+           ifnet_lock_attr);
+       lck_mtx_init(&ifp1->if_cached_route_lock, ifnet_snd_lock_group,
+           ifnet_lock_attr);
+       lck_mtx_init(&ifp1->if_snd.ifcq_lock, ifnet_snd_lock_group,
+           ifnet_lock_attr);
+
+       /* for receive data paths */
+       lck_mtx_init(&ifp1->if_poll_lock, ifnet_rcv_lock_group,
+           ifnet_lock_attr);
+
        TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link);
 
        *ifp = ifp1;
@@ -3846,7 +5534,9 @@ struct rtentry *
 ifnet_cached_rtlookup_inet(struct ifnet        *ifp, struct in_addr src_ip)
 {
        struct route            src_rt;
-       struct sockaddr_in      *dst = (struct sockaddr_in *)(&src_rt.ro_dst);
+       struct sockaddr_in      *dst;
+
+       dst = (struct sockaddr_in *)(void *)(&src_rt.ro_dst);
 
        ifp_src_route_copyout(ifp, &src_rt);
 
@@ -3900,7 +5590,8 @@ ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6)
                        src_rt.ro_dst.sin6_family = AF_INET6;
                }
                src_rt.ro_dst.sin6_scope_id = in6_addr2scopeid(ifp, src_ip6);
-               src_rt.ro_dst.sin6_addr = *src_ip6;
+               bcopy(src_ip6, &src_rt.ro_dst.sin6_addr,
+                   sizeof (src_rt.ro_dst.sin6_addr));
 
                if (src_rt.ro_rt == NULL) {
                        src_rt.ro_rt = rtalloc1_scoped(
@@ -3920,3 +5611,394 @@ ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6)
        return (src_rt.ro_rt);
 }
 #endif /* INET6 */
+
+void
+if_lqm_update(struct ifnet *ifp, int lqm)
+{
+       struct kev_dl_link_quality_metric_data ev_lqm_data;
+
+       VERIFY(lqm >= IFNET_LQM_MIN && lqm <= IFNET_LQM_MAX);
+
+       /* Normalize to edge */
+       if (lqm > IFNET_LQM_THRESH_UNKNOWN && lqm <= IFNET_LQM_THRESH_POOR)
+               lqm = IFNET_LQM_THRESH_POOR;
+       else if (lqm > IFNET_LQM_THRESH_POOR && lqm <= IFNET_LQM_THRESH_GOOD)
+               lqm = IFNET_LQM_THRESH_GOOD;
+
+       ifnet_lock_exclusive(ifp);
+       if (lqm == ifp->if_lqm) {
+               ifnet_lock_done(ifp);
+               return;         /* nothing to update */
+       }
+       ifp->if_lqm = lqm;
+       ifnet_lock_done(ifp);
+
+       bzero(&ev_lqm_data, sizeof (ev_lqm_data));
+       ev_lqm_data.link_quality_metric = lqm;
+
+       dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_LINK_QUALITY_METRIC_CHANGED,
+           (struct net_event_data *)&ev_lqm_data, sizeof (ev_lqm_data));
+}
+
+/* for uuid.c */
+int
+uuid_get_ethernet(u_int8_t *node)
+{
+       struct ifnet *ifp;
+       struct sockaddr_dl *sdl;
+
+       ifnet_head_lock_shared();
+       TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
+               ifnet_lock_shared(ifp);
+               IFA_LOCK_SPIN(ifp->if_lladdr);
+               sdl = (struct sockaddr_dl *)(void *)ifp->if_lladdr->ifa_addr;
+               if (sdl->sdl_type == IFT_ETHER) {
+                       memcpy(node, LLADDR(sdl), ETHER_ADDR_LEN);
+                       IFA_UNLOCK(ifp->if_lladdr);
+                       ifnet_lock_done(ifp);
+                       ifnet_head_done();
+                       return (0);
+               }
+               IFA_UNLOCK(ifp->if_lladdr);
+               ifnet_lock_done(ifp);
+       }
+       ifnet_head_done();
+
+       return (-1);
+}
+
+static int
+sysctl_rxpoll SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+       int i, err;
+
+       i = if_rxpoll;
+
+       err = sysctl_handle_int(oidp, &i, 0, req);
+       if (err != 0 || req->newptr == USER_ADDR_NULL)
+               return (err);
+
+       if (net_rxpoll == 0)
+               return (ENXIO);
+
+       if_rxpoll = i;
+       return (err);
+}
+
+static int
+sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+       int i, err;
+
+       i = if_sndq_maxlen;
+
+       err = sysctl_handle_int(oidp, &i, 0, req);
+       if (err != 0 || req->newptr == USER_ADDR_NULL)
+               return (err);
+
+       if (i < IF_SNDQ_MINLEN)
+               i = IF_SNDQ_MINLEN;
+
+       if_sndq_maxlen = i;
+       return (err);
+}
+
+static int
+sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+       int i, err;
+
+       i = if_rcvq_maxlen;
+
+       err = sysctl_handle_int(oidp, &i, 0, req);
+       if (err != 0 || req->newptr == USER_ADDR_NULL)
+               return (err);
+
+       if (i < IF_RCVQ_MINLEN)
+               i = IF_RCVQ_MINLEN;
+
+       if_rcvq_maxlen = i;
+       return (err);
+}
+
+void
+ifnet_fclist_append(struct sfb *sp, struct sfb_fc_list *fcl)
+{
+       struct sfb_bin_fcentry *fce, *tfce;
+
+       lck_mtx_lock_spin(&ifnet_fclist_lock);
+
+       SLIST_FOREACH_SAFE(fce, fcl, fce_link, tfce) {
+               SLIST_REMOVE(fcl, fce, sfb_bin_fcentry, fce_link);
+               SLIST_INSERT_HEAD(&ifnet_fclist, fce, fce_link);
+               sp->sfb_stats.flow_feedback++;
+       }
+       VERIFY(SLIST_EMPTY(fcl) && !SLIST_EMPTY(&ifnet_fclist));
+
+       wakeup(&ifnet_fclist);
+
+       lck_mtx_unlock(&ifnet_fclist_lock);
+}
+
+struct sfb_bin_fcentry *
+ifnet_fce_alloc(int how)
+{
+       struct sfb_bin_fcentry *fce;
+
+       fce = (how == M_WAITOK) ? zalloc(ifnet_fcezone) :
+           zalloc_noblock(ifnet_fcezone);
+       if (fce != NULL)
+               bzero(fce, ifnet_fcezone_size);
+
+       return (fce);
+}
+
+void
+ifnet_fce_free(struct sfb_bin_fcentry *fce)
+{
+       zfree(ifnet_fcezone, fce);
+}
+
+static void
+ifnet_fc_init(void)
+{
+       thread_t thread = THREAD_NULL;
+
+       SLIST_INIT(&ifnet_fclist);
+       lck_mtx_init(&ifnet_fclist_lock, ifnet_snd_lock_group, NULL);
+
+       ifnet_fcezone_size = P2ROUNDUP(sizeof (struct sfb_bin_fcentry),
+           sizeof (u_int64_t));
+       ifnet_fcezone = zinit(ifnet_fcezone_size,
+           IFNET_FCEZONE_MAX * ifnet_fcezone_size, 0, IFNET_FCEZONE_NAME);
+       if (ifnet_fcezone == NULL) {
+               panic("%s: failed allocating %s", __func__, IFNET_FCEZONE_NAME);
+               /* NOTREACHED */
+       }
+       zone_change(ifnet_fcezone, Z_EXPAND, TRUE);
+       zone_change(ifnet_fcezone, Z_CALLERACCT, FALSE);
+
+       if (kernel_thread_start(ifnet_fc_thread_func,
+           NULL, &thread) != KERN_SUCCESS) {
+               panic("%s: couldn't create flow event advisory thread",
+                   __func__);
+               /* NOTREACHED */
+       }
+       thread_deallocate(thread);
+}
+
+static int
+ifnet_fc_thread_cont(int err)
+{
+#pragma unused(err)
+       struct sfb_bin_fcentry *fce;
+       struct inp_fc_entry *infc;
+
+       for (;;) {
+               lck_mtx_assert(&ifnet_fclist_lock, LCK_MTX_ASSERT_OWNED);
+               while (SLIST_EMPTY(&ifnet_fclist)) {
+                       (void) msleep0(&ifnet_fclist, &ifnet_fclist_lock,
+                           (PSOCK | PSPIN), "ifnet_fc_cont", 0,
+                           ifnet_fc_thread_cont);
+                       /* NOTREACHED */
+               }
+
+               fce = SLIST_FIRST(&ifnet_fclist);
+               SLIST_REMOVE(&ifnet_fclist, fce, sfb_bin_fcentry, fce_link);
+               SLIST_NEXT(fce, fce_link) = NULL;
+               lck_mtx_unlock(&ifnet_fclist_lock);
+
+               infc = inp_fc_getinp(fce->fce_flowhash);
+               if (infc == NULL) {
+                       ifnet_fce_free(fce);
+                       lck_mtx_lock_spin(&ifnet_fclist_lock);
+                       continue;
+               }
+               VERIFY(infc->infc_inp != NULL);
+
+               inp_fc_feedback(infc->infc_inp);
+
+               inp_fc_entry_free(infc);
+               ifnet_fce_free(fce);
+               lck_mtx_lock_spin(&ifnet_fclist_lock);
+       }
+}
+
+static void
+ifnet_fc_thread_func(void *v, wait_result_t w)
+{
+#pragma unused(v, w)
+       lck_mtx_lock(&ifnet_fclist_lock);
+       (void) msleep0(&ifnet_fclist, &ifnet_fclist_lock,
+           (PSOCK | PSPIN), "ifnet_fc", 0, ifnet_fc_thread_cont);
+       /*
+        * msleep0() shouldn't have returned as PCATCH was not set;
+        * therefore assert in this case.
+        */
+       lck_mtx_unlock(&ifnet_fclist_lock);
+       VERIFY(0);
+}
+
+void
+dlil_node_present(struct ifnet *ifp, struct sockaddr *sa,
+    int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
+{
+       struct kev_dl_node_presence kev;
+       struct sockaddr_dl *sdl;
+       struct sockaddr_in6 *sin6;
+
+       VERIFY(ifp);
+       VERIFY(sa);
+       VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
+
+       bzero(&kev, sizeof (kev));
+       sin6 = &kev.sin6_node_address;
+       sdl = &kev.sdl_node_address;
+       nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6);
+       kev.rssi = rssi;
+       kev.link_quality_metric = lqm;
+       kev.node_proximity_metric = npm;
+       bcopy(srvinfo, kev.node_service_info, sizeof (kev.node_service_info));
+
+       nd6_alt_node_present(ifp, sin6, sdl, rssi, lqm, npm);
+       dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
+           &kev.link_data, sizeof (kev));
+}
+
+void
+dlil_node_absent(struct ifnet *ifp, struct sockaddr *sa)
+{
+       struct kev_dl_node_absence kev;
+       struct sockaddr_in6 *sin6;
+       struct sockaddr_dl *sdl;
+
+       VERIFY(ifp);
+       VERIFY(sa);
+       VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
+
+       bzero(&kev, sizeof (kev));
+       sin6 = &kev.sin6_node_address;
+       sdl = &kev.sdl_node_address;
+       nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6);
+
+       nd6_alt_node_absent(ifp, sin6);
+       dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_ABSENCE,
+           &kev.link_data, sizeof (kev));
+}
+
+errno_t
+ifnet_getset_opportunistic(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
+    struct proc *p)
+{
+       u_int32_t level = IFNET_THROTTLE_OFF;
+       errno_t result = 0;
+
+       VERIFY(cmd == SIOCSIFOPPORTUNISTIC || cmd == SIOCGIFOPPORTUNISTIC);
+
+       if (cmd == SIOCSIFOPPORTUNISTIC) {
+               /*
+                * XXX: Use priv_check_cred() instead of root check?
+                */
+               if ((result = proc_suser(p)) != 0)
+                       return (result);
+
+               if (ifr->ifr_opportunistic.ifo_flags ==
+                   IFRIFOF_BLOCK_OPPORTUNISTIC)
+                       level = IFNET_THROTTLE_OPPORTUNISTIC;
+               else if (ifr->ifr_opportunistic.ifo_flags == 0)
+                       level = IFNET_THROTTLE_OFF;
+               else
+                       result = EINVAL;
+
+               if (result == 0)
+                       result = ifnet_set_throttle(ifp, level);
+       } else if ((result = ifnet_get_throttle(ifp, &level)) == 0) {
+               ifr->ifr_opportunistic.ifo_flags = 0;
+               if (level == IFNET_THROTTLE_OPPORTUNISTIC) {
+                       ifr->ifr_opportunistic.ifo_flags |=
+                           IFRIFOF_BLOCK_OPPORTUNISTIC;
+               }
+       }
+
+       /*
+        * Return the count of current opportunistic connections
+        * over the interface.
+        */
+       if (result == 0) {
+               uint32_t flags = 0;
+               flags |= (cmd == SIOCSIFOPPORTUNISTIC) ?
+                       INPCB_OPPORTUNISTIC_SETCMD : 0;
+               flags |= (level == IFNET_THROTTLE_OPPORTUNISTIC) ? 
+                       INPCB_OPPORTUNISTIC_THROTTLEON : 0;
+               ifr->ifr_opportunistic.ifo_inuse =
+                   udp_count_opportunistic(ifp->if_index, flags) +
+                   tcp_count_opportunistic(ifp->if_index, flags);
+       }
+
+       if (result == EALREADY)
+               result = 0;
+
+       return (result);
+}
+
+int
+ifnet_get_throttle(struct ifnet *ifp, u_int32_t *level)
+{
+       struct ifclassq *ifq;
+       int err = 0;
+
+       if (!(ifp->if_eflags & IFEF_TXSTART))
+               return (ENXIO);
+
+       *level = IFNET_THROTTLE_OFF;
+
+       ifq = &ifp->if_snd;
+       IFCQ_LOCK(ifq);
+       /* Throttling works only for IFCQ, not ALTQ instances */
+       if (IFCQ_IS_ENABLED(ifq))
+               IFCQ_GET_THROTTLE(ifq, *level, err);
+       IFCQ_UNLOCK(ifq);
+
+       return (err);
+}
+
+int
+ifnet_set_throttle(struct ifnet *ifp, u_int32_t level)
+{
+       struct ifclassq *ifq;
+       int err = 0;
+
+       if (!(ifp->if_eflags & IFEF_TXSTART))
+               return (ENXIO);
+
+       switch (level) {
+       case IFNET_THROTTLE_OFF:
+       case IFNET_THROTTLE_OPPORTUNISTIC:
+#if PF_ALTQ
+               /* Throttling works only for IFCQ, not ALTQ instances */
+               if (ALTQ_IS_ENABLED(IFCQ_ALTQ(ifq)))
+                       return (ENXIO);
+#endif /* PF_ALTQ */
+               break;
+       default:
+               return (EINVAL);
+       }
+
+       ifq = &ifp->if_snd;
+       IFCQ_LOCK(ifq);
+       if (IFCQ_IS_ENABLED(ifq))
+               IFCQ_SET_THROTTLE(ifq, level, err);
+       IFCQ_UNLOCK(ifq);
+
+       if (err == 0) {
+               printf("%s%d: throttling level set to %d\n", ifp->if_name,
+                   ifp->if_unit, level);
+               if (level == IFNET_THROTTLE_OFF)
+                       ifnet_start(ifp);
+       }
+
+       return (err);
+}