xnu-344.tar.gz
[apple/xnu.git] / bsd / netinet / ip_output.c
index 7f74f5998964012f9623bdd34d78b0e68baf54ec..eaff4fc2d28b2645902a94ff83a76cd6c46c6de7 100644 (file)
  * SUCH DAMAGE.
  *
  *     @(#)ip_output.c 8.3 (Berkeley) 1/21/94
+ * $FreeBSD: src/sys/netinet/ip_output.c,v 1.99.2.16 2001/07/19 06:37:26 kris Exp $
  */
 
 #define _IP_VHL
 
-#if ISFB31
-#include "opt_ipfw.h"
-#include "opt_ipdn.h"
-#include "opt_ipdivert.h"
-#include "opt_ipfilter.h"
-#endif
-
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
-#if INET6
-#include <netinet/ip6.h>
-#include <netinet6/ip6_var.h>
-#endif
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
-#include <net/dlil.h>
 
+#include "faith.h"
+
+#include <net/dlil.h>
 #include <sys/kdebug.h>
 
 #define DBG_LAYER_BEG          NETDBG_CODE(DBG_NETIP, 1)
 #define DBG_FNC_IP_OUTPUT      NETDBG_CODE(DBG_NETIP, (1 << 8) | 1)
 
 
-#ifdef vax
+#if vax
 #include <machine/mtpr.h>
 #endif
 
-#if ISFB31
+#if __FreeBSD__
 #include <machine/in_cksum.h>
 
 static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options");
 #endif
 
-//static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options");
-
 #if IPSEC
 #include <netinet6/ipsec.h>
 #include <netkey/key.h>
+#if IPSEC_DEBUG
 #include <netkey/key_debug.h>
-
-#endif /*IPSEC*/
-
-#if !defined(COMPAT_IPFW) || COMPAT_IPFW == 1
-#undef COMPAT_IPFW
-#define COMPAT_IPFW 1
 #else
-#undef COMPAT_IPFW
+#define        KEYDEBUG(lev,arg)
 #endif
+#endif /*IPSEC*/
 
-#if COMPAT_IPFW
 #include <netinet/ip_fw.h>
-#endif
 
 #if DUMMYNET
 #include <netinet/ip_dummynet.h>
@@ -138,6 +122,7 @@ static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options");
 u_short ip_id;
 
 static struct mbuf *ip_insertoptions __P((struct mbuf *, struct mbuf *, int *));
+static struct ifnet *ip_multicast_if __P((struct in_addr *, int *));
 static void    ip_mloopback
        __P((struct ifnet *, struct mbuf *, struct sockaddr_in *, int));
 static int     ip_getmoptions
@@ -145,18 +130,27 @@ static int        ip_getmoptions
 static int     ip_pcbopts __P((int, struct mbuf **, struct mbuf *));
 static int     ip_setmoptions
        __P((struct sockopt *, struct ip_moptions **));
-static u_long  lo_dl_tag = 0;
 
-#if IPFILTER_LKM || IPFILTER
 int    ip_optcopy __P((struct ip *, struct ip *));
 extern int (*fr_checkp) __P((struct ip *, int, struct ifnet *, int, struct mbuf **));
-#else
-static int     ip_optcopy __P((struct ip *, struct ip *));
+#ifdef __APPLE__
+extern struct mbuf*    m_dup(register struct mbuf *m, int how);
 #endif
 
+static u_long  lo_dl_tag = 0;
+
+void in_delayed_cksum(struct mbuf *m);
+extern int apple_hwcksum_tx;
 
 extern struct protosw inetsw[];
 
+extern struct ip_linklocal_stat ip_linklocal_stat;
+
+/* temporary: for testing */
+#if IPSEC
+extern int ipsec_bypass;
+#endif
+
 /*
  * IP output.  The packet in mbuf chain m contains a skeletal IP
  * header (with len, off, ttl, proto, tos, src, dst).
@@ -178,52 +172,29 @@ ip_output(m0, opt, ro, flags, imo)
        int hlen = sizeof (struct ip);
        int len, off, error = 0;
        struct sockaddr_in *dst;
-       struct in_ifaddr *ia;
-       int isbroadcast;
+       struct in_ifaddr *ia = NULL;
+       int isbroadcast, sw_csum;
 #if IPSEC
        struct route iproute;
-       struct socket *so;
+       struct socket *so = NULL;
        struct secpolicy *sp = NULL;
 #endif
+       u_int16_t divert_cookie;                /* firewall cookie */
 #if IPFIREWALL_FORWARD
        int fwd_rewrite_src = 0;
 #endif
-
-
-#if !IPDIVERT /* dummy variable for the firewall code to play with */
-        u_short ip_divert_cookie = 0 ;
-#endif
-#if COMPAT_IPFW
-       struct ip_fw_chain *rule = NULL ;
+       struct ip_fw_chain *rule = NULL;
+  
+#if IPDIVERT
+       /* Get and reset firewall cookie */
+       divert_cookie = ip_divert_cookie;
+       ip_divert_cookie = 0;
+#else
+       divert_cookie = 0;
 #endif
 
        KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0);
 
-#if IPSEC
-       /*
-        * NOTE: m->m_pkthdr is NULL cleared below just to prevent ipfw code
-        * from SEGV.
-        * ipfw code uses rcvif to determine incoming interface, and
-        * KAME uses rcvif for ipsec processing.
-        * ipfw may not be working right with KAME at this moment.
-        * We need more tests.
-        */
-#if DUMMYNET
-       if (m->m_type == MT_DUMMYNET) {
-               if (m->m_next != NULL) {
-                       so = (struct socket *)m->m_next->m_pkthdr.rcvif;
-                       m->m_next->m_pkthdr.rcvif = NULL;
-               } else
-                       so = NULL;
-       } else
-#endif
-       {
-               so = ipsec_getsocket(m);
-               ipsec_setsocket(m, NULL);
-       }
-#endif /*IPSEC*/
-
-
 #if IPFIREWALL && DUMMYNET
         /*  
          * dummynet packet are prepended a vestigial mbuf with
@@ -231,26 +202,39 @@ ip_output(m0, opt, ro, flags, imo)
          * rule.
          */ 
         if (m->m_type == MT_DUMMYNET) {
-            struct mbuf *tmp_m = m ;
             /*
              * the packet was already tagged, so part of the
              * processing was already done, and we need to go down.
-             * opt, flags and imo have already been used, and now
-             * they are used to hold ifp and hlen and NULL, respectively.
+             * Get parameters from the header.
              */
-            rule = (struct ip_fw_chain *)(m->m_data) ;
-            m = m->m_next ;
-            FREE(tmp_m, M_IPFW);
+                       rule = (struct ip_fw_chain *)(m->m_data) ;
+                       opt = NULL ;
+                       ro = & ( ((struct dn_pkt *)m)->ro ) ;
+                       imo = NULL ;
+                       dst = ((struct dn_pkt *)m)->dn_dst ;
+                       ifp = ((struct dn_pkt *)m)->ifp ;
+                       flags = ((struct dn_pkt *)m)->flags ;
+                       m0 = m = m->m_next ;
+#if IPSEC
+           if (ipsec_bypass == 0) {
+               so = ipsec_getsocket(m);
+               (void)ipsec_setsocket(m, NULL);
+           }
+#endif
             ip = mtod(m, struct ip *);
-            dst = (struct sockaddr_in *)&ro->ro_dst;
-            ifp = (struct ifnet *)opt;
             hlen = IP_VHL_HL(ip->ip_vhl) << 2 ;
-            opt = NULL ;
-            flags = 0 ; /* XXX is this correct ? */
+            if (ro->ro_rt != NULL)
+                ia = (struct in_ifaddr *)ro->ro_rt->rt_ifa;
             goto sendit;
         } else
             rule = NULL ;
 #endif
+#if IPSEC
+       if (ipsec_bypass == 0) {
+               so = ipsec_getsocket(m);
+               (void)ipsec_setsocket(m, NULL);
+       }
+#endif
 
 #if    DIAGNOSTIC
        if ((m->m_flags & M_PKTHDR) == 0)
@@ -270,7 +254,11 @@ ip_output(m0, opt, ro, flags, imo)
        if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
                ip->ip_vhl = IP_MAKE_VHL(IPVERSION, hlen >> 2);
                ip->ip_off &= IP_DF;
+#if RANDOM_IP_ID
+               ip->ip_id = ip_randomid();
+#else
                ip->ip_id = htons(ip_id++);
+#endif
                ipstat.ips_localout++;
        } else {
                hlen = IP_VHL_HL(ip->ip_vhl) << 2;
@@ -287,7 +275,7 @@ ip_output(m0, opt, ro, flags, imo)
         */
        if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
           dst->sin_addr.s_addr != ip->ip_dst.s_addr)) {
-               RTFREE(ro->ro_rt);
+               rtfree(ro->ro_rt);
                ro->ro_rt = (struct rtentry *)0;
        }
        if (ro->ro_rt == 0) {
@@ -355,8 +343,10 @@ ip_output(m0, opt, ro, flags, imo)
                 */
                if (imo != NULL) {
                        ip->ip_ttl = imo->imo_multicast_ttl;
-                       if (imo->imo_multicast_ifp != NULL)
+                       if (imo->imo_multicast_ifp != NULL) {
                                ifp = imo->imo_multicast_ifp;
+                               dl_tag = ifp->if_data.default_proto;
+                       }
                        if (imo->imo_multicast_vif != -1)
                                ip->ip_src.s_addr =
                                    ip_mcast_src(imo->imo_multicast_vif);
@@ -379,8 +369,7 @@ ip_output(m0, opt, ro, flags, imo)
                if (ip->ip_src.s_addr == INADDR_ANY) {
                        register struct in_ifaddr *ia1;
 
-                       for (ia1 = in_ifaddrhead.tqh_first; ia1;
-                            ia1 = ia1->ia_link.tqe_next)
+                       TAILQ_FOREACH(ia1, &in_ifaddrhead, ia_link)
                                if (ia1->ia_ifp == ifp) {
                                        ip->ip_src = IA_SIN(ia1)->sin_addr;
                                        break;
@@ -493,6 +482,150 @@ ip_output(m0, opt, ro, flags, imo)
        }
 
 sendit:
+        /*
+         * Force IP TTL to 255 following draft-ietf-zeroconf-ipv4-linklocal.txt
+         */
+        if (IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)) || IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) {
+               ip_linklocal_stat.iplls_out_total++;
+               if (ip->ip_ttl != MAXTTL) {
+                       ip_linklocal_stat.iplls_out_badttl++;
+                       ip->ip_ttl = MAXTTL;
+               }
+        }
+
+#if IPSEC
+       /* temporary for testing only: bypass ipsec altogether */
+
+       if (ipsec_bypass != 0)
+               goto skip_ipsec;
+
+       /* get SP for this packet */
+       if (so == NULL)
+               sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, flags, &error);
+       else
+               sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND, so, &error);
+
+       if (sp == NULL) {
+               ipsecstat.out_inval++;
+               goto bad;
+       }
+
+       error = 0;
+
+       /* check policy */
+       switch (sp->policy) {
+       case IPSEC_POLICY_DISCARD:
+               /*
+                * This packet is just discarded.
+                */
+               ipsecstat.out_polvio++;
+               goto bad;
+
+       case IPSEC_POLICY_BYPASS:
+       case IPSEC_POLICY_NONE:
+               /* no need to do IPsec. */
+               goto skip_ipsec;
+       
+       case IPSEC_POLICY_IPSEC:
+               if (sp->req == NULL) {
+                       /* acquire a policy */
+                       error = key_spdacquire(sp);
+                       goto bad;
+               }
+               break;
+
+       case IPSEC_POLICY_ENTRUST:
+       default:
+               printf("ip_output: Invalid policy found. %d\n", sp->policy);
+       }
+    {
+       struct ipsec_output_state state;
+       bzero(&state, sizeof(state));
+       state.m = m;
+       if (flags & IP_ROUTETOIF) {
+               state.ro = &iproute;
+               bzero(&iproute, sizeof(iproute));
+       } else
+               state.ro = ro;
+       state.dst = (struct sockaddr *)dst;
+
+       ip->ip_sum = 0;
+
+       /*
+        * XXX
+        * delayed checksums are not currently compatible with IPsec
+        */
+       if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
+               in_delayed_cksum(m);
+               m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
+       }
+
+       HTONS(ip->ip_len);
+       HTONS(ip->ip_off);
+
+       error = ipsec4_output(&state, sp, flags);
+
+       m = state.m;
+       if (flags & IP_ROUTETOIF) {
+               /*
+                * if we have tunnel mode SA, we may need to ignore
+                * IP_ROUTETOIF.
+                */
+               if (state.ro != &iproute || state.ro->ro_rt != NULL) {
+                       flags &= ~IP_ROUTETOIF;
+                       ro = state.ro;
+               }
+       } else
+               ro = state.ro;
+       dst = (struct sockaddr_in *)state.dst;
+       if (error) {
+               /* mbuf is already reclaimed in ipsec4_output. */
+               m0 = NULL;
+               switch (error) {
+               case EHOSTUNREACH:
+               case ENETUNREACH:
+               case EMSGSIZE:
+               case ENOBUFS:
+               case ENOMEM:
+                       break;
+               default:
+                       printf("ip4_output (ipsec): error code %d\n", error);
+                       /*fall through*/
+               case ENOENT:
+                       /* don't show these error codes to the user */
+                       error = 0;
+                       break;
+               }
+               goto bad;
+       }
+    }
+
+       /* be sure to update variables that are affected by ipsec4_output() */
+       ip = mtod(m, struct ip *);
+#ifdef _IP_VHL
+       hlen = IP_VHL_HL(ip->ip_vhl) << 2;
+#else
+       hlen = ip->ip_hl << 2;
+#endif
+       if (ro->ro_rt == NULL) {
+               if ((flags & IP_ROUTETOIF) == 0) {
+                       printf("ip_output: "
+                               "can't update route after IPsec processing\n");
+                       error = EHOSTUNREACH;   /*XXX*/
+                       goto bad;
+               }
+       } else {
+               ia = ifatoia(ro->ro_rt->rt_ifa);
+               ifp = ro->ro_rt->rt_ifp;
+               dl_tag = ia->ia_ifa.ifa_dlt;
+       }
+
+       /* make it flipped, again. */
+       NTOHS(ip->ip_len);
+       NTOHS(ip->ip_off);
+skip_ipsec:
+#endif /*IPSEC*/
+
        /*
         * IpHack's section.
         * - Xlate: translate packet's addr/port (NAT).
@@ -500,7 +633,6 @@ sendit:
         * - Wrap: fake packet's addr/port <unimpl.>
         * - Encapsulate: put it in another IP and send out. <unimp.>
         */ 
-#if IPFILTER || IPFILTER_LKM
        if (fr_checkp) {
                struct  mbuf    *m1 = m;
 
@@ -508,27 +640,21 @@ sendit:
                        goto done;
                ip = mtod(m = m1, struct ip *);
        }
-#endif
-
-#if COMPAT_IPFW
-        if (ip_nat_ptr && !(*ip_nat_ptr)(&ip, &m, ifp, IP_NAT_OUT)) {
-               error = EACCES; 
-               goto done;
-       }
 
        /*
         * Check with the firewall...
         */
-       if (ip_fw_chk_ptr) {
+       if (fw_enable && ip_fw_chk_ptr) {
                struct sockaddr_in *old = dst;
 
                off = (*ip_fw_chk_ptr)(&ip,
-                   hlen, ifp, &ip_divert_cookie, &m, &rule, &dst);
+                   hlen, ifp, &divert_cookie, &m, &rule, &dst);
                 /*
                  * On return we must do the following:
-                 * m == NULL         -> drop the pkt
+                 * IP_FW_PORT_DENY_FLAG                -> drop the pkt (XXX new)
                  * 1<=off<= 0xffff   -> DIVERT
-                 * (off & 0x10000)   -> send to a DUMMYNET pipe
+                 * (off & IP_FW_PORT_DYNT_FLAG)        -> send to a DUMMYNET pipe
+                 * (off & IP_FW_PORT_TEE_FLAG) -> TEE the packet
                  * dst != old        -> IPFIREWALL_FORWARD
                  * off==0, dst==old  -> accept
                  * If some of the above modules is not compiled in, then
@@ -537,31 +663,62 @@ sendit:
                  * unsupported rules), but better play safe and drop
                  * packets in case of doubt.
                  */
-               if (!m) { /* firewall said to reject */
-                       error = EACCES;
-                       goto done;
+               if ( (off & IP_FW_PORT_DENY_FLAG) || m == NULL) {
+                       if (m)
+                               m_freem(m);
+                       error = EACCES ;
+                       goto done ;
                }
+               ip = mtod(m, struct ip *);
                if (off == 0 && dst == old) /* common case */
                        goto pass ;
 #if DUMMYNET
-                if (off & 0x10000) {  
+                if ((off & IP_FW_PORT_DYNT_FLAG) != 0) {
                     /*
                      * pass the pkt to dummynet. Need to include
-                     * pipe number, m, ifp, ro, hlen because these are
+                     * pipe number, m, ifp, ro, dst because these are
                      * not recomputed in the next pass.
                      * All other parameters have been already used and
                      * so they are not needed anymore. 
                      * XXX note: if the ifp or ro entry are deleted
                      * while a pkt is in dummynet, we are in trouble!
                      */ 
-                    dummynet_io(off & 0xffff, DN_TO_IP_OUT, m,ifp,ro,hlen,rule);
-                       goto done;
+                   error = dummynet_io(off & 0xffff, DN_TO_IP_OUT, m,
+                               ifp,ro,dst,rule, flags);
+                   goto done;
                }
 #endif   
 #if IPDIVERT
-                if (off > 0 && off < 0x10000) {         /* Divert packet */
-                       ip_divert_port = off & 0xffff ;
-                       (*ip_protox[IPPROTO_DIVERT]->pr_input)(m, 0);
+               if (off != 0 && (off & IP_FW_PORT_DYNT_FLAG) == 0) {
+                       struct mbuf *clone = NULL;
+
+                       /* Clone packet if we're doing a 'tee' */
+                       if ((off & IP_FW_PORT_TEE_FLAG) != 0)
+                               clone = m_dup(m, M_DONTWAIT);
+                       /*
+                        * XXX
+                        * delayed checksums are not currently compatible
+                        * with divert sockets.
+                        */
+                       if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
+                               in_delayed_cksum(m);
+                               m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
+                       }
+
+                       /* Restore packet header fields to original values */
+                       HTONS(ip->ip_len);
+                       HTONS(ip->ip_off);
+
+                       /* Deliver packet to divert input routine */
+                       ip_divert_cookie = divert_cookie;
+                       divert_packet(m, 0, off & 0xffff);
+
+                       /* If 'tee', continue with original packet */
+                       if (clone != NULL) {
+                               m = clone;
+                               ip = mtod(m, struct ip *);
+                               goto pass;
+                       }
                        goto done;
                }
 #endif
@@ -600,8 +757,7 @@ sendit:
                         * as the packet runs through ip_input() as
                         * it is done through a ISR.
                         */
-                       for (ia = TAILQ_FIRST(&in_ifaddrhead); ia;
-                                       ia = TAILQ_NEXT(ia, ia_link)) {
+                       TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) {
                                /*
                                 * If the addr to forward to is one
                                 * of ours, we pretend to
@@ -616,12 +772,15 @@ sendit:
                                ip_fw_fwd_addr = dst;
                                if (m->m_pkthdr.rcvif == NULL)
                                        m->m_pkthdr.rcvif = ifunit("lo0");
-                               ip->ip_len = htons((u_short)ip->ip_len);
-                               ip->ip_off = htons((u_short)ip->ip_off);
-                               ip->ip_sum = 0;
-                               
-                               ip->ip_sum = in_cksum(m, hlen);
-
+                               if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
+                                       m->m_pkthdr.csum_flags |=
+                                           CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+                                       m0->m_pkthdr.csum_data = 0xffff;
+                               }
+                               m->m_pkthdr.csum_flags |=
+                                   CSUM_IP_CHECKED | CSUM_IP_VALID;
+                               HTONS(ip->ip_len);
+                               HTONS(ip->ip_off);
                                ip_input(m);
                                goto done;
                        }
@@ -644,7 +803,7 @@ sendit:
 
                        ia = ifatoia(ro_fwd->ro_rt->rt_ifa);
                        ifp = ro_fwd->ro_rt->rt_ifp;
-                       dl_tag = ro->ro_rt->rt_dlt;
+                       dl_tag = ro_fwd->ro_rt->rt_dlt;
                        ro_fwd->ro_rt->rt_use++;
                        if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY)
                                dst = (struct sockaddr_in *)ro_fwd->ro_rt->rt_gateway;
@@ -653,7 +812,7 @@ sendit:
                                    (ro_fwd->ro_rt->rt_flags & RTF_BROADCAST);
                        else
                                isbroadcast = in_broadcast(dst->sin_addr, ifp);
-                       RTFREE(ro->ro_rt);
+                       rtfree(ro->ro_rt);
                        ro->ro_rt = ro_fwd->ro_rt;
                        dst = (struct sockaddr_in *)&ro_fwd->ro_dst;
 
@@ -675,150 +834,74 @@ sendit:
                 error = EACCES; /* not sure this is the right error msg */
                 goto done;
        }
-#endif /* COMPAT_IPFW */
 
 pass:
+       m->m_pkthdr.csum_flags |= CSUM_IP;
+       sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist;
 
-#if defined(PM)
-       /*
-        * Processing IP filter/NAT.
-        * Return TRUE  iff this packet is discarded.
-        * Return FALSE iff this packet is accepted.
-        */
-
-       if (doNatFil && pm_out(ro->ro_rt->rt_ifp, ip, m))
-           goto done;
-#endif
-
-#if IPSEC
-       /* get SP for this packet */
-       if (so == NULL)
-               sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, flags, &error);
-       else
-               sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND, so, &error);
-
-       if (sp == NULL) {
-               ipsecstat.out_inval++;
-               goto bad;
-       }
-
-       error = 0;
-
-       /* check policy */
-       switch (sp->policy) {
-       case IPSEC_POLICY_DISCARD:
+       if ((ifp->if_hwassist & CSUM_TCP_SUM16) != 0) {
                /*
-                * This packet is just discarded.
+                * Special case code for GMACE
+                * frames that can be checksummed by GMACE SUM16 HW:
+                * frame >64, no fragments, no UDP
                 */
-               ipsecstat.out_polvio++;
-               goto bad;
-
-       case IPSEC_POLICY_BYPASS:
-       case IPSEC_POLICY_NONE:
-               /* no need to do IPsec. */
-               goto skip_ipsec;
-       
-       case IPSEC_POLICY_IPSEC:
-               if (sp->req == NULL) {
-                       /* XXX should be panic ? */
-                       printf("ip_output: No IPsec request specified.\n");
-                       error = EINVAL;
-                       goto bad;
+               if (apple_hwcksum_tx && (m->m_pkthdr.csum_flags & CSUM_TCP)
+                       && (ip->ip_len > 50) && (ip->ip_len <= ifp->if_mtu)) {
+                       /* Apple GMAC HW, expects STUFF_OFFSET << 16  | START_OFFSET */
+                       u_short offset = (IP_VHL_HL(ip->ip_vhl) << 2) +14 ; /* IP+Enet header length */
+                       u_short csumprev= m->m_pkthdr.csum_data & 0xFFFF;
+                       m->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_TCP_SUM16; /* for GMAC */
+                       m->m_pkthdr.csum_data = (csumprev + offset)  << 16 ;
+                       m->m_pkthdr.csum_data += offset; 
+                       sw_csum = CSUM_DELAY_IP; /* do IP hdr chksum in software */
                }
-               break;
-
-       case IPSEC_POLICY_ENTRUST:
-       default:
-               printf("ip_output: Invalid policy found. %d\n", sp->policy);
-       }
-
-       ip->ip_len = htons((u_short)ip->ip_len);
-       ip->ip_off = htons((u_short)ip->ip_off);
-       ip->ip_sum = 0;
-
-    {
-       struct ipsec_output_state state;
-       bzero(&state, sizeof(state));
-       state.m = m;
-       if (flags & IP_ROUTETOIF) {
-               state.ro = &iproute;
-               bzero(&iproute, sizeof(iproute));
-       } else
-               state.ro = ro;
-       state.dst = (struct sockaddr *)dst;
-
-       error = ipsec4_output(&state, sp, flags);
-
-       m = state.m;
-       if (flags & IP_ROUTETOIF) {
-               /*
-                * if we have tunnel mode SA, we may need to ignore
-                * IP_ROUTETOIF.
-                */
-               if (state.ro != &iproute || state.ro->ro_rt != NULL) {
-                       flags &= ~IP_ROUTETOIF;
-                       ro = state.ro;
-               }
-       } else
-               ro = state.ro;
-       dst = (struct sockaddr_in *)state.dst;
-       if (error) {
-               /* mbuf is already reclaimed in ipsec4_output. */
-               m0 = NULL;
-               switch (error) {
-               case EHOSTUNREACH:
-               case ENETUNREACH:
-               case EMSGSIZE:
-               case ENOBUFS:
-               case ENOMEM:
-                       break;
-               default:
-                       printf("ip4_output (ipsec): error code %d\n", error);
-                       /*fall through*/
-               case ENOENT:
-                       /* don't show these error codes to the user */
-                       error = 0;
-                       break;
+               else {
+                       /* let the software handle any UDP or TCP checksums */
+                       sw_csum |= (CSUM_DELAY_DATA & m->m_pkthdr.csum_flags);
                }
-               goto bad;
        }
-    }
-
-       /* be sure to update variables that are affected by ipsec4_output() */
-       ip = mtod(m, struct ip *);
-#ifdef _IP_VHL
-       hlen = IP_VHL_HL(ip->ip_vhl) << 2;
-#else
-       hlen = ip->ip_hl << 2;
-#endif
-       if (ro->ro_rt == NULL) {
-               if ((flags & IP_ROUTETOIF) == 0) {
-                       printf("ip_output: "
-                               "can't update route after IPsec processing\n");
-                       error = EHOSTUNREACH;   /*XXX*/
-                       goto bad;
-               }
-       } else {
-               /* nobody uses ia beyond here */
-               ifp = ro->ro_rt->rt_ifp;
+       
+       if (sw_csum & CSUM_DELAY_DATA) {
+               in_delayed_cksum(m);
+               sw_csum &= ~CSUM_DELAY_DATA;
+               m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
        }
-
-       /* make it flipped, again. */
-       ip->ip_len = ntohs((u_short)ip->ip_len);
-       ip->ip_off = ntohs((u_short)ip->ip_off);
-skip_ipsec:
-#endif /*IPSEC*/
+       
+       m->m_pkthdr.csum_flags &= ifp->if_hwassist;
 
        /*
-        * If small enough for interface, can just send directly.
+        * If small enough for interface, or the interface will take
+        * care of the fragmentation for us, can just send directly.
         */
-       if ((u_short)ip->ip_len <= ifp->if_mtu) {
-               ip->ip_len = htons((u_short)ip->ip_len);
-               ip->ip_off = htons((u_short)ip->ip_off);
+       if ((u_short)ip->ip_len <= ifp->if_mtu ||
+           ifp->if_hwassist & CSUM_FRAGMENT) {
+               HTONS(ip->ip_len);
+               HTONS(ip->ip_off);
                ip->ip_sum = 0;
-               ip->ip_sum = in_cksum(m, hlen);
+               if (sw_csum & CSUM_DELAY_IP) {
+                       ip->ip_sum = in_cksum(m, hlen);
+               }
+               
+#ifndef __APPLE__
+               /* Record statistics for this interface address. */
+               if (!(flags & IP_FORWARDING) && ia != NULL) {
+                       ia->ia_ifa.if_opackets++;
+                       ia->ia_ifa.if_obytes += m->m_pkthdr.len;
+               }
+#endif
+
+#if IPSEC
+               /* clean ipsec history once it goes out of the node */
+               if (ipsec_bypass == 0)
+                       ipsec_delaux(m);
+#endif
+#if __APPLE__
                error = dlil_output(dl_tag, m, (void *) ro->ro_rt,
                                    (struct sockaddr *)dst, 0);
+#else
+               error = (*ifp->if_output)(ifp, m,
+                               (struct sockaddr *)dst, ro->ro_rt);
+#endif
                goto done;
        }
        /*
@@ -848,9 +931,23 @@ skip_ipsec:
                goto bad;
        }
 
+       /*
+        * if the interface will not calculate checksums on
+        * fragmented packets, then do it here.
+        */
+       if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
+           (ifp->if_hwassist & CSUM_IP_FRAGS) == 0) {
+               in_delayed_cksum(m);
+               if (m == NULL)
+                       return(ENOMEM);
+               m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
+       }
+
+
     {
        int mhlen, firstlen = len;
        struct mbuf **mnext = &m->m_nextpkt;
+       int nfrags = 1;
 
        /*
         * Loop through length of segment after first fragment,
@@ -865,7 +962,7 @@ skip_ipsec:
                        ipstat.ips_odropped++;
                        goto sendorfree;
                }
-               m->m_flags |= (m0->m_flags & M_MCAST);
+               m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
                m->m_data += max_linkhdr;
                mhip = mtod(m, struct ip *);
                *mhip = *ip;
@@ -891,13 +988,23 @@ skip_ipsec:
                }
                m->m_pkthdr.len = mhlen + len;
                m->m_pkthdr.rcvif = (struct ifnet *)0;
-               mhip->ip_off = htons((u_short)mhip->ip_off);
+               m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
+               HTONS(mhip->ip_off);
                mhip->ip_sum = 0;
-               mhip->ip_sum = in_cksum(m, mhlen);
+               if (sw_csum & CSUM_DELAY_IP) {
+                       mhip->ip_sum = in_cksum(m, mhlen);
+               }
                *mnext = m;
                mnext = &m->m_nextpkt;
-               ipstat.ips_ofragments++;
+               nfrags++;
        }
+       ipstat.ips_ofragments += nfrags;
+
+       /* set first/last markers for fragment chain */
+       m->m_flags |= M_LASTFRAG;
+       m0->m_flags |= M_FIRSTFRAG | M_FRAG;
+       m0->m_pkthdr.csum_data = nfrags;
+
        /*
         * Update first fragment by trimming what's been copied out
         * and updating header, then send each fragment (in order).
@@ -906,10 +1013,12 @@ skip_ipsec:
        m_adj(m, hlen + firstlen - (u_short)ip->ip_len);
        m->m_pkthdr.len = hlen + firstlen;
        ip->ip_len = htons((u_short)m->m_pkthdr.len);
-       ip->ip_off = htons((u_short)(ip->ip_off | IP_MF));
+       ip->ip_off |= IP_MF;
+       HTONS(ip->ip_off);
        ip->ip_sum = 0;
-       ip->ip_sum = in_cksum(m, hlen);
-
+       if (sw_csum & CSUM_DELAY_IP) {
+               ip->ip_sum = in_cksum(m, hlen);
+       }
 sendorfree:
 
        KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr, 
@@ -918,10 +1027,28 @@ sendorfree:
        for (m = m0; m; m = m0) {
                m0 = m->m_nextpkt;
                m->m_nextpkt = 0;
-               if (error == 0)
+#if IPSEC
+               /* clean ipsec history once it goes out of the node */
+               if (ipsec_bypass == 0)
+                       ipsec_delaux(m);
+#endif
+               if (error == 0) {
+#ifndef __APPLE__
+                       /* Record statistics for this interface address. */
+                       if (ia != NULL) {
+                               ia->ia_ifa.if_opackets++;
+                               ia->ia_ifa.if_obytes += m->m_pkthdr.len;
+                       }
+#endif
+                       
+#if __APPLE__
                    error = dlil_output(dl_tag, m, (void *) ro->ro_rt,
                                        (struct sockaddr *)dst, 0);
-               else
+#else
+                       error = (*ifp->if_output)(ifp, m,
+                           (struct sockaddr *)dst, ro->ro_rt);
+#endif
+               } else
                        m_freem(m);
        }
 
@@ -930,8 +1057,9 @@ sendorfree:
     }
 done:
 #if IPSEC
+       if (ipsec_bypass == 0) {
        if (ro == &iproute && ro->ro_rt) {
-               RTFREE(ro->ro_rt);
+               rtfree(ro->ro_rt);
                ro->ro_rt = NULL;
        }
        if (sp != NULL) {
@@ -939,6 +1067,7 @@ done:
                        printf("DP ip_output call free SP:%x\n", sp));
                key_freesp(sp);
        }
+       }
 #endif /* IPSEC */
 
        KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_END, error,0,0,0,0);
@@ -948,6 +1077,35 @@ bad:
        goto done;
 }
 
+/*
+ * Finish a transport checksum that was deferred at output time and store
+ * it in the packet.
+ *
+ * m must start with the IP header.  ip_len is used directly as a byte
+ * count (ip_mloopback converts it with NTOHS before calling).  The low
+ * 16 bits of m_pkthdr.csum_data give the offset of the checksum field
+ * measured from the end of the IP header.
+ *
+ * The mbuf chain is never reshaped here: m is passed by value, so a new
+ * chain head (or the NULL that a failed m_pullup returns after releasing
+ * the chain) could not be reported back to the caller.  The checksum is
+ * written in place instead, even when the field does not sit wholly
+ * inside the first mbuf.  Nothing is stored if the chain is too short.
+ */
+void
+in_delayed_cksum(struct mbuf *m)
+{
+       struct ip *ip;
+       struct mbuf *n, *n2;
+       u_short csum, offset;
+       u_char *cp;
+       int off;
+
+       ip = mtod(m, struct ip *);
+       offset = IP_VHL_HL(ip->ip_vhl) << 2;    /* IP header length, bytes */
+       csum = in_cksum_skip(m, ip->ip_len, offset);
+       /* UDP reserves a transmitted checksum of zero for "none" (RFC 768) */
+       if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0)
+               csum = 0xffff;
+       offset += m->m_pkthdr.csum_data & 0xFFFF;        /* checksum offset */
+
+       if (offset > ip->ip_len) /* bogus offset */
+               return;
+
+       if (offset + sizeof(u_short) > m->m_len) {
+               /* message text kept as-is so existing log matching still works */
+               printf("delayed m_pullup, m->len: %d  off: %d  p: %d\n",
+                   m->m_len, offset, ip->ip_p);
+       }
+
+       /* locate the mbuf holding the first byte of the checksum field */
+       off = offset;
+       for (n = m; n != NULL && off >= n->m_len; n = n->m_next)
+               off -= n->m_len;
+       if (n == NULL)          /* chain is shorter than the offset */
+               return;
+
+       if (off + (int)sizeof(u_short) <= n->m_len) {
+               /* whole field lies in one mbuf: store it directly */
+               *(u_short *)(n->m_data + off) = csum;
+               return;
+       }
+
+       /*
+        * The field straddles two mbufs: its first byte is the last byte
+        * of n, its second byte is the first byte of the next non-empty
+        * mbuf.  Find both before writing so the store is all-or-nothing.
+        */
+       for (n2 = n->m_next; n2 != NULL && n2->m_len == 0; n2 = n2->m_next)
+               continue;
+       if (n2 == NULL)         /* second byte is missing; store nothing */
+               return;
+       cp = (u_char *)&csum;
+       *(mtod(n, u_char *) + off) = cp[0];
+       *mtod(n2, u_char *) = cp[1];
+}
+
 /*
  * Insert IP options into preformed packet.
  * Adjust IP destination as required for IP source routing,
@@ -975,6 +1133,7 @@ ip_insertoptions(m, opt, phlen)
                MGETHDR(n, M_DONTWAIT, MT_HEADER);
                if (n == 0)
                        return (m);
+               n->m_pkthdr.rcvif = (struct ifnet *)0;
                n->m_pkthdr.len = m->m_pkthdr.len + optlen;
                m->m_len -= sizeof(struct ip);
                m->m_data += sizeof(struct ip);
@@ -1001,9 +1160,6 @@ ip_insertoptions(m, opt, phlen)
  * Copy options from ip to jp,
  * omitting those not copied during fragmentation.
  */
-#if !IPFILTER && !IPFILTER_LKM
-static
-#endif
 int
 ip_optcopy(ip, jp)
        struct ip *ip, *jp;
@@ -1023,8 +1179,16 @@ ip_optcopy(ip, jp)
                        *dp++ = IPOPT_NOP;
                        optlen = 1;
                        continue;
-               } else
-                       optlen = cp[IPOPT_OLEN];
+               }
+#if DIAGNOSTIC
+               if (cnt < IPOPT_OLEN + sizeof(*cp))
+                       panic("malformed IPv4 option passed to ip_optcopy");
+#endif
+               optlen = cp[IPOPT_OLEN];
+#if DIAGNOSTIC
+               if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
+                       panic("malformed IPv4 option passed to ip_optcopy");
+#endif
                /* bogus lengths should have been caught by ip_dooptions */
                if (optlen > cnt)
                        optlen = cnt;
@@ -1088,7 +1252,9 @@ ip_ctloutput(so, sopt)
                case IP_RECVRETOPTS:
                case IP_RECVDSTADDR:
                case IP_RECVIF:
+#if defined(NFAITH) && NFAITH > 0
                case IP_FAITH:
+#endif
                        error = sooptcopyin(sopt, &optval, sizeof optval,
                                            sizeof optval);
                        if (error)
@@ -1124,9 +1290,11 @@ ip_ctloutput(so, sopt)
                                OPTSET(INP_RECVIF);
                                break;
 
+#if defined(NFAITH) && NFAITH > 0
                        case IP_FAITH:
                                OPTSET(INP_FAITH);
                                break;
+#endif
                        }
                        break;
 #undef OPTSET
@@ -1177,9 +1345,9 @@ ip_ctloutput(so, sopt)
                        struct mbuf *m;
                        int optname;
 
-                       if (error = sooptgetm(sopt, &m)) /* XXX */
+                       if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
                                break;
-                       if (error = sooptmcopyin(sopt, m)) /* XXX */
+                       if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
                                break;
                        priv = (sopt->sopt_p != NULL &&
                                suser(sopt->sopt_p->p_ucred,
@@ -1221,7 +1389,9 @@ ip_ctloutput(so, sopt)
                case IP_RECVDSTADDR:
                case IP_RECVIF:
                case IP_PORTRANGE:
+#if defined(NFAITH) && NFAITH > 0
                case IP_FAITH:
+#endif
                        switch (sopt->sopt_name) {
 
                        case IP_TOS:
@@ -1259,9 +1429,11 @@ ip_ctloutput(so, sopt)
                                        optval = 0;
                                break;
 
+#if defined(NFAITH) && NFAITH > 0
                        case IP_FAITH:
                                optval = OPTBIT(INP_FAITH);
                                break;
+#endif
                        }
                        error = sooptcopyout(sopt, &optval, sizeof optval);
                        break;
@@ -1279,23 +1451,16 @@ ip_ctloutput(so, sopt)
                case IP_IPSEC_POLICY:
                {
                        struct mbuf *m = NULL;
-                       size_t len = 0;
                        caddr_t req = NULL;
+                       size_t len = 0;
 
-                       if (error = sooptgetm(sopt, &m)) /* XXX */
-                               break;
-                       if (error = sooptmcopyin(sopt, m)) /* XXX */
-                               break;
-                       if (m) {
+                       if (m != 0) {
                                req = mtod(m, caddr_t);
                                len = m->m_len;
                        }
-
                        error = ipsec4_get_policy(sotoinpcb(so), req, len, &m);
                        if (error == 0)
-                               error = sooptmcopyout(sopt, m); /* XXX */
-
-                       /* if error, m_freem called at soopt_mcopyout(). */
+                               error = soopt_mcopyout(sopt, m); /* XXX */
                        if (error == 0)
                                m_freem(m);
                        break;
@@ -1423,6 +1588,33 @@ bad:
  * transmission, and one (IP_MULTICAST_TTL) totally duplicates a
  * standard option (IP_TTL).
  */
+
+/*
+ * Map a multicast interface selector to its ifnet.
+ *
+ * Following RFC1724 section 3.3, an address inside 0.0.0.0/8 is not a
+ * real address: its low 24 bits are an interface index, looked up in
+ * ifindex2ifnet[].  Any other address is resolved with INADDR_TO_IFP.
+ *
+ * If ifindexp is non-NULL, *ifindexp is zeroed first and overwritten with
+ * the index only when an in-range index was decoded.  Returns NULL when
+ * the index is out of range.
+ */
+static struct ifnet *
+ip_multicast_if(a, ifindexp)
+       struct in_addr *a;
+       int *ifindexp;
+{
+       unsigned long haddr = ntohl(a->s_addr); /* selector, host order */
+       struct ifnet *ifp;
+       int idx;
+
+       if (ifindexp != NULL)
+               *ifindexp = 0;
+
+       /* ordinary address: let the address-to-interface lookup decide */
+       if ((haddr >> 24) != 0) {
+               INADDR_TO_IFP(*a, ifp);
+               return ifp;
+       }
+
+       /* 0.0.0.0/8: the low 24 bits are the interface index */
+       idx = haddr & 0xffffff;
+       if (idx < 0 || idx > if_index)
+               return NULL;
+       ifp = ifindex2ifnet[idx];
+       if (ifindexp != NULL)
+               *ifindexp = idx;
+       return ifp;
+}
+
 /*
  * Set the IP multicast options in response to user setsockopt().
  */
@@ -1435,10 +1627,11 @@ ip_setmoptions(sopt, imop)
        int i;
        struct in_addr addr;
        struct ip_mreq mreq;
-       struct ifnet *ifp;
+       struct ifnet *ifp = NULL;
        struct ip_moptions *imo = *imop;
        struct route ro;
        struct sockaddr_in *dst;
+       int ifindex;
        int s;
 
        if (imo == NULL) {
@@ -1453,6 +1646,7 @@ ip_setmoptions(sopt, imop)
                        return (ENOBUFS);
                *imop = imo;
                imo->imo_multicast_ifp = NULL;
+               imo->imo_multicast_addr.s_addr = INADDR_ANY;
                imo->imo_multicast_vif = -1;
                imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
                imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
@@ -1498,13 +1692,17 @@ ip_setmoptions(sopt, imop)
                 * it supports multicasting.
                 */
                s = splimp();
-               INADDR_TO_IFP(addr, ifp);
+               ifp = ip_multicast_if(&addr, &ifindex);
                if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
                        splx(s);
                        error = EADDRNOTAVAIL;
                        break;
                }
                imo->imo_multicast_ifp = ifp;
+               if (ifindex)
+                       imo->imo_multicast_addr = addr;
+               else
+                       imo->imo_multicast_addr.s_addr = INADDR_ANY;
                splx(s);
                break;
 
@@ -1582,16 +1780,18 @@ ip_setmoptions(sopt, imop)
                        dst->sin_family = AF_INET;
                        dst->sin_addr = mreq.imr_multiaddr;
                        rtalloc(&ro);
-                       if (ro.ro_rt == NULL) {
-                               error = EADDRNOTAVAIL;
-                               splx(s);
-                               break;
+                       if (ro.ro_rt != NULL) {
+                               ifp = ro.ro_rt->rt_ifp;
+                               rtfree(ro.ro_rt);
+                       }
+                       else {
+                               /* If there's no default route, try using loopback */
+                               mreq.imr_interface.s_addr = INADDR_LOOPBACK;
                        }
-                       ifp = ro.ro_rt->rt_ifp;
-                       rtfree(ro.ro_rt);
                }
-               else {
-                       INADDR_TO_IFP(mreq.imr_interface, ifp);
+               
+               if (ifp == NULL) {
+                       ifp = ip_multicast_if(&mreq.imr_interface, NULL);
                }
 
                /*
@@ -1659,7 +1859,7 @@ ip_setmoptions(sopt, imop)
                if (mreq.imr_interface.s_addr == INADDR_ANY)
                        ifp = NULL;
                else {
-                       INADDR_TO_IFP(mreq.imr_interface, ifp);
+                       ifp = ip_multicast_if(&mreq.imr_interface, NULL);
                        if (ifp == NULL) {
                                error = EADDRNOTAVAIL;
                                splx(s);
@@ -1741,7 +1941,10 @@ ip_getmoptions(sopt, imo)
        case IP_MULTICAST_IF:
                if (imo == NULL || imo->imo_multicast_ifp == NULL)
                        addr.s_addr = INADDR_ANY;
-               else {
+               else if (imo->imo_multicast_addr.s_addr) {
+                       /* return the value user has set */
+                       addr = imo->imo_multicast_addr;
+               } else {
                        IFP_TO_IA(imo->imo_multicast_ifp, ia);
                        addr.s_addr = (ia == NULL) ? INADDR_ANY
                                : IA_SIN(ia)->sin_addr.s_addr;
@@ -1789,7 +1992,8 @@ ip_freemoptions(imo)
 
        if (imo != NULL) {
                for (i = 0; i < imo->imo_num_memberships; ++i)
-                       in_delmulti(imo->imo_membership[i]);
+                       if (imo->imo_membership[i] != NULL)
+                               in_delmulti(imo->imo_membership[i]);
                FREE(imo, M_IPMOPTS);
        }
 }
@@ -1820,11 +2024,10 @@ ip_mloopback(ifp, m, dst, hlen)
                 * than the interface's MTU.  Can this possibly matter?
                 */
                ip = mtod(copym, struct ip *);
-               ip->ip_len = htons((u_short)ip->ip_len);
-               ip->ip_off = htons((u_short)ip->ip_off);
+               HTONS(ip->ip_len);
+               HTONS(ip->ip_off);
                ip->ip_sum = 0;
                ip->ip_sum = in_cksum(copym, hlen);
-
                /*
                 * NB:
                 * It's not clear whether there are any lingering
@@ -1844,6 +2047,30 @@ ip_mloopback(ifp, m, dst, hlen)
                }
 #endif
 
+
+        /*
+        * Mark checksum as valid or calculate checksum for loopback.
+        * 
+        * This is done this way because we have to embed the ifp of
+        * the interface we will send the original copy of the packet
+        * out on in the mbuf. ip_input will check if_hwassist of the
+        * embedded ifp and ignore all csum_flags if if_hwassist is 0.
+        * The UDP checksum has not been calculated yet.
+        */
+        if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
+            if (ifp->if_hwassist) {
+                copym->m_pkthdr.csum_flags |=
+                    CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
+                    CSUM_IP_CHECKED | CSUM_IP_VALID;
+                copym->m_pkthdr.csum_data = 0xffff;
+            } else {
+               NTOHS(ip->ip_len);
+                in_delayed_cksum(copym);
+               HTONS(ip->ip_len);
+           }
+        }
+
+
                /*
                 * TedW: 
                 * We need to send all loopback traffic down to dlil in case 
@@ -1859,8 +2086,8 @@ ip_mloopback(ifp, m, dst, hlen)
                 *  to make the loopback driver compliant with the data link
                 *  requirements.
                 */
-               if (lo_dl_tag)
-               {   copym->m_pkthdr.rcvif = ifp;
+               if (lo_dl_tag) {
+                       copym->m_pkthdr.rcvif = ifp;
                    dlil_output(lo_dl_tag, copym, 0, (struct sockaddr *) dst, 0);
                } else {
                    printf("Warning: ip_output call to dlil_find_dltag failed!\n");