apple/xnu (tag xnu-1699.22.81): bsd/netinet/ip_output.c
1 /*
2 * Copyright (c) 2000-2011 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1982, 1986, 1988, 1990, 1993
30 * The Regents of the University of California. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94
61 * $FreeBSD: src/sys/netinet/ip_output.c,v 1.99.2.16 2001/07/19 06:37:26 kris Exp $
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #define _IP_VHL
71
72 #include <sys/param.h>
73 #include <sys/systm.h>
74 #include <sys/kernel.h>
75 #include <sys/malloc.h>
76 #include <sys/mbuf.h>
77 #include <sys/protosw.h>
78 #include <sys/socket.h>
79 #include <sys/socketvar.h>
80 #include <kern/locks.h>
81 #include <sys/sysctl.h>
82 #include <sys/mcache.h>
83
84 #include <machine/endian.h>
85 #include <pexpert/pexpert.h>
86
87 #include <net/if.h>
88 #include <net/if_dl.h>
89 #include <net/if_types.h>
90 #include <net/route.h>
91 #include <net/ntstat.h>
92 #include <net/net_osdep.h>
93
94 #include <netinet/in.h>
95 #include <netinet/in_systm.h>
96 #include <netinet/ip.h>
97 #include <netinet/in_pcb.h>
98 #include <netinet/in_var.h>
99 #include <netinet/ip_var.h>
100
101 #include <netinet/kpi_ipfilter_var.h>
102
103 #if CONFIG_MACF_NET
104 #include <security/mac_framework.h>
105 #endif
106
107 #include "faith.h"
108
109 #include <net/dlil.h>
110 #include <sys/kdebug.h>
111 #include <libkern/OSAtomic.h>
112
113 #define DBG_LAYER_BEG NETDBG_CODE(DBG_NETIP, 1)
114 #define DBG_LAYER_END NETDBG_CODE(DBG_NETIP, 3)
115 #define DBG_FNC_IP_OUTPUT NETDBG_CODE(DBG_NETIP, (1 << 8) | 1)
116 #define DBG_FNC_IPSEC4_OUTPUT NETDBG_CODE(DBG_NETIP, (2 << 8) | 1)
117
118 #define SWAP16(v) ((((v) & 0xff) << 8) | ((v) >> 8))
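/* e.g. SWAP16(0x1234) == 0x3412; used by in_delayed_cksum_offset() below to retry the ip_len comparison when the header may be in either byte order */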
119
120 #if IPSEC
121 #include <netinet6/ipsec.h>
122 #include <netkey/key.h>
123 #if IPSEC_DEBUG
124 #include <netkey/key_debug.h>
125 #else
126 #define KEYDEBUG(lev,arg)
127 #endif
128 #endif /*IPSEC*/
129
130 #include <netinet/ip_fw.h>
131 #include <netinet/ip_divert.h>
132 #include <mach/sdt.h>
133
134 #if DUMMYNET
135 #include <netinet/ip_dummynet.h>
136 #endif
137
138 #if PF
139 #include <net/pfvar.h>
140 #endif /* PF */
141
142 #if IPFIREWALL_FORWARD_DEBUG
143 #define print_ip(a) printf("%ld.%ld.%ld.%ld",(ntohl(a.s_addr)>>24)&0xFF,\
144 (ntohl(a.s_addr)>>16)&0xFF,\
145 (ntohl(a.s_addr)>>8)&0xFF,\
146 (ntohl(a.s_addr))&0xFF);
147 #endif
148
149
150 u_short ip_id;
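/* next datagram identification to assign when RANDOM_IP_ID is not in use; see ip_output_list() */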
151
152 static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);
153 static void ip_mloopback(struct ifnet *, struct mbuf *,
154 struct sockaddr_in *, int);
155 static int ip_pcbopts(int, struct mbuf **, struct mbuf *);
156 static void imo_trace(struct ip_moptions *, int);
157
158 static void ip_out_cksum_stats(int, u_int32_t);
159 static struct ifaddr *in_selectsrcif(struct ip *, struct route *, unsigned int);
160
161 int ip_optcopy(struct ip *, struct ip *);
162 void in_delayed_cksum_offset(struct mbuf *, int );
163 void in_cksum_offset(struct mbuf* , size_t );
164
165 extern int (*fr_checkp)(struct ip *, int, struct ifnet *, int, struct mbuf **);
166
167 extern struct protosw inetsw[];
168
169 extern struct ip_linklocal_stat ip_linklocal_stat;
170 extern lck_mtx_t *ip_mutex;
171
172 /* temporary: for testing */
173 #if IPSEC
174 extern int ipsec_bypass;
175 #endif
176
177 static int ip_maxchainsent = 0;
178 SYSCTL_INT(_net_inet_ip, OID_AUTO, maxchainsent, CTLFLAG_RW | CTLFLAG_LOCKED,
179 &ip_maxchainsent, 0, "use dlil_output_list");
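/* ip_maxchainsent records the longest mbuf packet chain handed to the driver in a single ifnet_output() call (updated in ip_output_list() below) */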
180 #if DEBUG
181 static int forge_ce = 0;
182 SYSCTL_INT(_net_inet_ip, OID_AUTO, forge_ce, CTLFLAG_RW | CTLFLAG_LOCKED,
183 &forge_ce, 0, "Forge ECN CE");
184 #endif /* DEBUG */
185
186 static int ip_select_srcif_debug = 0;
187 SYSCTL_INT(_net_inet_ip, OID_AUTO, select_srcif_debug, CTLFLAG_RW | CTLFLAG_LOCKED,
188 &ip_select_srcif_debug, 0, "log source interface selection debug info");
189
190 #define IMO_TRACE_HIST_SIZE 32 /* size of trace history */
191
192 /* For gdb */
193 __private_extern__ unsigned int imo_trace_hist_size = IMO_TRACE_HIST_SIZE;
194
195 struct ip_moptions_dbg {
196 struct ip_moptions imo; /* ip_moptions */
197 u_int16_t imo_refhold_cnt; /* # of IMO_ADDREF */
198 u_int16_t imo_refrele_cnt; /* # of IMO_REMREF */
199 /*
200 * Alloc and free callers.
201 */
202 ctrace_t imo_alloc;
203 ctrace_t imo_free;
204 /*
205 * Circular lists of IMO_ADDREF and IMO_REMREF callers.
206 */
207 ctrace_t imo_refhold[IMO_TRACE_HIST_SIZE];
208 ctrace_t imo_refrele[IMO_TRACE_HIST_SIZE];
209 };
210
211 #if DEBUG
212 static unsigned int imo_debug = 1; /* debugging (enabled) */
213 #else
214 static unsigned int imo_debug; /* debugging (disabled) */
215 #endif /* !DEBUG */
216 static unsigned int imo_size; /* size of zone element */
217 static struct zone *imo_zone; /* zone for ip_moptions */
218
219 #define IMO_ZONE_MAX 64 /* maximum elements in zone */
220 #define IMO_ZONE_NAME "ip_moptions" /* zone name */
221
222 /*
223 * IP output. The packet in mbuf chain m contains a skeletal IP
224 * header (with len, off, ttl, proto, tos, src, dst).
225 * The mbuf chain containing the packet will be freed.
226 * The mbuf opt, if present, will not be freed.
227 */
228 int
229 ip_output(
230 struct mbuf *m0,
231 struct mbuf *opt,
232 struct route *ro,
233 int flags,
234 struct ip_moptions *imo,
235 struct ip_out_args *ipoa)
236 {
237 int error;
238 error = ip_output_list(m0, 0, opt, ro, flags, imo, ipoa);
239 return error;
240 }
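/*
 * Illustrative caller sketch (hypothetical, not part of the original file):
 * a sender that has already built a skeletal IP header in the mbuf chain m
 * might invoke the simple wrapper above roughly as follows. The mbuf chain
 * is consumed on both success and failure; any route cached in ro remains
 * the caller's responsibility to release.
 *
 *	struct route ro;
 *	int error;
 *
 *	bzero(&ro, sizeof (ro));
 *	error = ip_output(m, NULL, &ro, IP_ALLOWBROADCAST, NULL, NULL);
 *	if (ro.ro_rt != NULL)
 *		rtfree(ro.ro_rt);
 */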
241
242 /*
243 * Returns: 0 Success
244 * ENOMEM
245 * EADDRNOTAVAIL
246 * ENETUNREACH
247 * EHOSTUNREACH
248 * EACCES
249 * EMSGSIZE
250 * ENOBUFS
251 * ipsec4_getpolicybyaddr:??? [IPSEC 4th argument, contents modified]
252 * ipsec4_getpolicybysock:??? [IPSEC 4th argument, contents modified]
253 * key_spdacquire:??? [IPSEC]
254 * ipsec4_output:??? [IPSEC]
255 * <fr_checkp>:??? [firewall]
256 * ip_dn_io_ptr:??? [dummynet]
257 * dlil_output:??? [DLIL]
258 * dlil_output_list:??? [DLIL]
259 *
260 * Notes: The ipsec4_getpolicyby{addr|sock} function error returns are
261 * only used as the error return from this function where one of
262 * these functions fails to return a policy.
263 */
264 int
265 ip_output_list(
266 struct mbuf *m0,
267 int packetchain,
268 struct mbuf *opt,
269 struct route *ro,
270 int flags,
271 struct ip_moptions *imo,
272 struct ip_out_args *ipoa
273 )
274 {
275 struct ip *ip;
276 struct ifnet *ifp = NULL;
277 struct mbuf *m = m0, **mppn = NULL;
278 int hlen = sizeof (struct ip);
279 int len = 0, error = 0;
280 struct sockaddr_in *dst = NULL;
281 struct in_ifaddr *ia = NULL, *src_ia = NULL;
282 int isbroadcast, sw_csum;
283 struct in_addr pkt_dst;
284 struct ipf_pktopts *ippo = NULL, ipf_pktopts;
285 #if IPSEC
286 struct route iproute;
287 struct socket *so = NULL;
288 struct secpolicy *sp = NULL;
289 #endif
290 #if IPFIREWALL_FORWARD
291 int fwd_rewrite_src = 0;
292 #endif
293 #if IPFIREWALL
294 int off;
295 struct ip_fw_args args;
296 struct m_tag *tag;
297 struct sockaddr_in *next_hop_from_ipfwd_tag = NULL;
298 #endif
299 int didfilter = 0;
300 ipfilter_t inject_filter_ref = 0;
301 #if DUMMYNET
302 struct route saved_route;
303 struct ip_out_args saved_ipoa;
304 struct sockaddr_in dst_buf;
305 #endif /* DUMMYNET */
306 struct mbuf * packetlist;
307 int pktcnt = 0, tso = 0;
308 u_int32_t bytecnt = 0;
309 unsigned int ifscope;
310 unsigned int nocell;
311 boolean_t select_srcif;
312 KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0);
313
314 packetlist = m0;
315 #if IPFIREWALL
316 args.next_hop = NULL;
317 args.eh = NULL;
318 args.rule = NULL;
319 args.divert_rule = 0; /* divert cookie */
320 args.ipoa = NULL;
321
322 if (SLIST_EMPTY(&m0->m_pkthdr.tags))
323 goto ipfw_tags_done;
324
325 /* Grab info from mtags prepended to the chain */
326 #if DUMMYNET
327 if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID,
328 KERNEL_TAG_TYPE_DUMMYNET, NULL)) != NULL) {
329 struct dn_pkt_tag *dn_tag;
330
331 dn_tag = (struct dn_pkt_tag *)(tag+1);
332 args.rule = dn_tag->rule;
333 opt = NULL;
334 saved_route = dn_tag->ro;
335 ro = &saved_route;
336
337 imo = NULL;
338 bcopy(&dn_tag->dn_dst, &dst_buf, sizeof(dst_buf));
339 dst = &dst_buf;
340 ifp = dn_tag->ifp;
341 flags = dn_tag->flags;
342 saved_ipoa = dn_tag->ipoa;
343 ipoa = &saved_ipoa;
344
345 m_tag_delete(m0, tag);
346 }
347 #endif /* DUMMYNET */
348
349 #if IPDIVERT
350 if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID,
351 KERNEL_TAG_TYPE_DIVERT, NULL)) != NULL) {
352 struct divert_tag *div_tag;
353
354 div_tag = (struct divert_tag *)(tag+1);
355 args.divert_rule = div_tag->cookie;
356
357 m_tag_delete(m0, tag);
358 }
359 #endif /* IPDIVERT */
360
361 if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID,
362 KERNEL_TAG_TYPE_IPFORWARD, NULL)) != NULL) {
363 struct ip_fwd_tag *ipfwd_tag;
364
365 ipfwd_tag = (struct ip_fwd_tag *)(tag+1);
366 next_hop_from_ipfwd_tag = ipfwd_tag->next_hop;
367
368 m_tag_delete(m0, tag);
369 }
370 ipfw_tags_done:
371 #endif /* IPFIREWALL */
372
373 m = m0;
374
375 #if DIAGNOSTIC
376 if (!m || (m->m_flags & M_PKTHDR) == 0)
377 panic("ip_output no HDR");
378 if (!ro)
379 panic("ip_output no route, proto = %d",
380 mtod(m, struct ip *)->ip_p);
381 #endif
382
383 bzero(&ipf_pktopts, sizeof(struct ipf_pktopts));
384 ippo = &ipf_pktopts;
385
386 /*
387 * At present the IP_OUTARGS flag implies a request for IP to
388 * perform source interface selection. In the forwarding case,
389 * only the ifscope value is used, as source interface selection
390 * doesn't take place.
391 */
392 if (ip_doscopedroute && (flags & IP_OUTARGS)) {
393 select_srcif = !(flags & IP_FORWARDING);
394 ifscope = ipoa->ipoa_boundif;
395 ipf_pktopts.ippo_flags = IPPOF_BOUND_IF;
396 ipf_pktopts.ippo_flags |= (ifscope << IPPOF_SHIFT_IFSCOPE);
397 } else {
398 select_srcif = FALSE;
399 ifscope = IFSCOPE_NONE;
400 }
401
402 if (flags & IP_OUTARGS) {
403 nocell = ipoa->ipoa_nocell;
404 if (nocell)
405 ipf_pktopts.ippo_flags |= IPPOF_NO_IFT_CELLULAR;
406 } else {
407 nocell = 0;
408 }
409
410 #if IPFIREWALL
411 if (args.rule != NULL) { /* dummynet already saw us */
412 ip = mtod(m, struct ip *);
413 hlen = IP_VHL_HL(ip->ip_vhl) << 2 ;
414 if (ro->ro_rt != NULL) {
415 RT_LOCK_SPIN(ro->ro_rt);
416 ia = (struct in_ifaddr *)ro->ro_rt->rt_ifa;
417 if (ia) {
418 /* Become a regular mutex */
419 RT_CONVERT_LOCK(ro->ro_rt);
420 IFA_ADDREF(&ia->ia_ifa);
421 }
422 RT_UNLOCK(ro->ro_rt);
423 }
424 #if IPSEC
425 if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) {
426 so = ipsec_getsocket(m);
427 (void)ipsec_setsocket(m, NULL);
428 }
429 #endif
430 goto sendit;
431 }
432 #endif /* IPFIREWALL */
433
434 #if IPSEC
435 if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) {
436 so = ipsec_getsocket(m);
437 (void)ipsec_setsocket(m, NULL);
438 }
439 #endif
440 loopit:
441 /*
442 * No need to process the packet twice if we've
443 * already seen it
444 */
445 if (!SLIST_EMPTY(&m->m_pkthdr.tags))
446 inject_filter_ref = ipf_get_inject_filter(m);
447 else
448 inject_filter_ref = 0;
449
450 if (opt) {
451 m = ip_insertoptions(m, opt, &len);
452 hlen = len;
453 }
454 ip = mtod(m, struct ip *);
455 #if IPFIREWALL
456 /*
457 * rdar://8542331
458 *
459 * When dealing with a packet chain, we need to reset "next_hop" because
460 * "dst" may have been changed to the gateway address below for the previous
461 * packet of the chain. This could cause the route to be inadvertently changed
462 * to the route to the gateway address (instead of the route to the destination).
463 */
464 args.next_hop = next_hop_from_ipfwd_tag;
465 pkt_dst = args.next_hop ? args.next_hop->sin_addr : ip->ip_dst;
466 #else
467 pkt_dst = ip->ip_dst;
468 #endif
469
470 /*
471 * We must not send if the packet is destined to network zero.
472 * RFC1122 3.2.1.3 (a) and (b).
473 */
474 if (IN_ZERONET(ntohl(pkt_dst.s_addr))) {
475 error = EHOSTUNREACH;
476 goto bad;
477 }
478
479 /*
480 * Fill in IP header.
481 */
482 if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
483 ip->ip_vhl = IP_MAKE_VHL(IPVERSION, hlen >> 2);
484 ip->ip_off &= IP_DF;
485 #if RANDOM_IP_ID
486 ip->ip_id = ip_randomid();
487 #else
488 ip->ip_id = htons(ip_id++);
489 #endif
490 OSAddAtomic(1, &ipstat.ips_localout);
491 } else {
492 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
493 }
494
495 #if DEBUG
496 /* For debugging, we let the stack forge congestion */
497 if (forge_ce != 0 &&
498 ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT1 ||
499 (ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT0)) {
500 ip->ip_tos = (ip->ip_tos & ~IPTOS_ECN_MASK) | IPTOS_ECN_CE;
501 forge_ce--;
502 }
503 #endif /* DEBUG */
504
505 KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr,
506 ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len);
507
508 dst = (struct sockaddr_in *)&ro->ro_dst;
509
510 /*
511 * If there is a cached route,
512 * check that it is to the same destination
513 * and is still up. If not, free it and try again.
514 * The address family should also be checked in case of sharing the
515 * cache with IPv6.
516 */
517
518 if (ro->ro_rt != NULL) {
519 if (ro->ro_rt->generation_id != route_generation &&
520 ((flags & (IP_ROUTETOIF | IP_FORWARDING)) == 0) &&
521 (ip->ip_src.s_addr != INADDR_ANY)) {
522 src_ia = ifa_foraddr(ip->ip_src.s_addr);
523 if (src_ia == NULL) {
524 error = EADDRNOTAVAIL;
525 goto bad;
526 }
527 IFA_REMREF(&src_ia->ia_ifa);
528 }
529 /*
530 * Test rt_flags without holding rt_lock for performance
531 * reasons; if the route is down it will hopefully be
532 * caught by the layer below (since it uses this route
533 * as a hint) or during the next transmit.
534 */
535 if ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
536 dst->sin_family != AF_INET ||
537 dst->sin_addr.s_addr != pkt_dst.s_addr) {
538 rtfree(ro->ro_rt);
539 ro->ro_rt = NULL;
540 }
541 /*
542 * If we're doing source interface selection, we may not
543 * want to use this route; only synch up the generation
544 * count otherwise.
545 */
546 if (!select_srcif && ro->ro_rt != NULL &&
547 ro->ro_rt->generation_id != route_generation)
548 ro->ro_rt->generation_id = route_generation;
549 }
550 if (ro->ro_rt == NULL) {
551 bzero(dst, sizeof(*dst));
552 dst->sin_family = AF_INET;
553 dst->sin_len = sizeof(*dst);
554 dst->sin_addr = pkt_dst;
555 }
556 /*
557 * If routing to interface only,
558 * short circuit routing lookup.
559 */
560 #define ifatoia(ifa) ((struct in_ifaddr *)(ifa))
561 #define sintosa(sin) ((struct sockaddr *)(sin))
562 if (flags & IP_ROUTETOIF) {
563 if (ia)
564 IFA_REMREF(&ia->ia_ifa);
565 if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == 0) {
566 if ((ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == 0) {
567 OSAddAtomic(1, &ipstat.ips_noroute);
568 error = ENETUNREACH;
569 goto bad;
570 }
571 }
572 ifp = ia->ia_ifp;
573 ip->ip_ttl = 1;
574 isbroadcast = in_broadcast(dst->sin_addr, ifp);
575 } else if (IN_MULTICAST(ntohl(pkt_dst.s_addr)) &&
576 imo != NULL && (ifp = imo->imo_multicast_ifp) != NULL) {
577 /*
578 * Bypass the normal routing lookup for multicast
579 * packets if the interface is specified.
580 */
581 isbroadcast = 0;
582 if (ia != NULL)
583 IFA_REMREF(&ia->ia_ifa);
584
585 /* Macro takes reference on ia */
586 IFP_TO_IA(ifp, ia);
587 } else {
588 boolean_t cloneok = FALSE;
589 /*
590 * Perform source interface selection; the source IP address
591 * must belong to one of the addresses of the interface used
592 * by the route. For performance reasons, do this only if
593 * there is no route, or if the routing table has changed,
594 * or if we haven't done source interface selection on this
595 * route (for this PCB instance) before.
596 */
597 if (select_srcif && ip->ip_src.s_addr != INADDR_ANY &&
598 (ro->ro_rt == NULL || !(ro->ro_rt->rt_flags & RTF_UP) ||
599 ro->ro_rt->generation_id != route_generation ||
600 !(ro->ro_flags & ROF_SRCIF_SELECTED))) {
601 struct ifaddr *ifa;
602
603 /* Find the source interface */
604 ifa = in_selectsrcif(ip, ro, ifscope);
605
606 /*
607 * If the source address belongs to a cellular interface
608 * and the caller forbids our using interfaces of such
609 * type, pretend that there is no source address.
610 */
611 if (nocell && ifa != NULL &&
612 ifa->ifa_ifp->if_type == IFT_CELLULAR) {
613 IFA_REMREF(ifa);
614 error = EADDRNOTAVAIL;
615 goto bad;
616 }
617
618 /*
619 * If the source address is spoofed (in the case
620 * of IP_RAWOUTPUT), or if this is destined for
621 * local/loopback, just let it go out using the
622 * interface of the route. Otherwise, there's no
623 * interface having such an address, so bail out.
624 */
625 if (ifa == NULL && !(flags & IP_RAWOUTPUT) &&
626 ifscope != lo_ifp->if_index) {
627 error = EADDRNOTAVAIL;
628 goto bad;
629 }
630
631 /*
632 * If the caller didn't explicitly specify the scope,
633 * pick it up from the source interface. If the cached
634 * route was wrong and was blown away as part of source
635 * interface selection, don't mask out RTF_PRCLONING
636 * since that route may have been allocated by the ULP,
637 * unless the IP header was created by the caller or
638 * the destination is IPv4 LLA. The check for the
639 * latter is needed because IPv4 LLAs are never scoped
640 * in the current implementation, and we don't want to
641 * replace the resolved IPv4 LLA route with one whose
642 * gateway points to that of the default gateway on
643 * the primary interface of the system.
644 */
645 if (ifa != NULL) {
646 if (ifscope == IFSCOPE_NONE)
647 ifscope = ifa->ifa_ifp->if_index;
648 IFA_REMREF(ifa);
649 cloneok = (!(flags & IP_RAWOUTPUT) &&
650 !(IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))));
651 }
652 }
653
654 /*
655 * If this is the case, we probably don't want to allocate
656 * a protocol-cloned route since we didn't get one from the
657 * ULP. This lets TCP do its thing, while not burdening
658 * forwarding or ICMP with the overhead of cloning a route.
659 * Of course, we still want to do any cloning requested by
660 * the link layer, as this is probably required in all cases
661 * for correct operation (as it is for ARP).
662 */
663 if (ro->ro_rt == NULL) {
664 unsigned long ign = RTF_PRCLONING;
665 /*
666 * We make an exception here: if the destination
667 * address is INADDR_BROADCAST, allocate a protocol-
668 * cloned host route so that we end up with a route
669 * marked with the RTF_BROADCAST flag. Otherwise,
670 * we would end up referring to the default route,
671 * instead of creating a cloned host route entry.
672 * That would introduce inconsistencies between ULPs
673 * that allocate a route and those that don't. The
674 * RTF_BROADCAST route is important since we'd want
675 * to send out undirected IP broadcast packets using
676 * link-level broadcast address. Another exception
677 * is for ULP-created routes that got blown away by
678 * source interface selection (see above).
679 *
680 * These exceptions will no longer be necessary when
681 * the RTF_PRCLONING scheme is no longer present.
682 */
683 if (cloneok || dst->sin_addr.s_addr == INADDR_BROADCAST)
684 ign &= ~RTF_PRCLONING;
685
686 /*
687 * Loosen the route lookup criteria if the ifscope
688 * corresponds to the loopback interface; this is
689 * needed to support Application Layer Gateways
690 * listening on loopback, in conjunction with packet
691 * filter redirection rules. The final source IP
692 * address will be rewritten by the packet filter
693 * prior to the RFC1122 loopback check below.
694 */
695 if (ifscope == lo_ifp->if_index)
696 rtalloc_ign(ro, ign);
697 else
698 rtalloc_scoped_ign(ro, ign, ifscope);
699
700 /*
701 * If the route points to a cellular interface and the
702 * caller forbids our using interfaces of such type,
703 * pretend that there is no route.
704 */
705 if (nocell && ro->ro_rt != NULL) {
706 RT_LOCK_SPIN(ro->ro_rt);
707 if (ro->ro_rt->rt_ifp->if_type ==
708 IFT_CELLULAR) {
709 RT_UNLOCK(ro->ro_rt);
710 rtfree(ro->ro_rt);
711 ro->ro_rt = NULL;
712 } else {
713 RT_UNLOCK(ro->ro_rt);
714 }
715 }
716 }
717
718 if (ro->ro_rt == NULL) {
719 OSAddAtomic(1, &ipstat.ips_noroute);
720 error = EHOSTUNREACH;
721 goto bad;
722 }
723
724 if (ia)
725 IFA_REMREF(&ia->ia_ifa);
726 RT_LOCK_SPIN(ro->ro_rt);
727 ia = ifatoia(ro->ro_rt->rt_ifa);
728 if (ia) {
729 /* Become a regular mutex */
730 RT_CONVERT_LOCK(ro->ro_rt);
731 IFA_ADDREF(&ia->ia_ifa);
732 }
733 ifp = ro->ro_rt->rt_ifp;
734 ro->ro_rt->rt_use++;
735 if (ro->ro_rt->rt_flags & RTF_GATEWAY)
736 dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway;
737 if (ro->ro_rt->rt_flags & RTF_HOST) {
738 isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST);
739 } else {
740 /* Become a regular mutex */
741 RT_CONVERT_LOCK(ro->ro_rt);
742 isbroadcast = in_broadcast(dst->sin_addr, ifp);
743 }
744 RT_UNLOCK(ro->ro_rt);
745 }
746
747 if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
748 struct in_multi *inm;
749 u_int32_t vif;
750 u_int8_t ttl = IP_DEFAULT_MULTICAST_TTL;
751 u_int8_t loop = IP_DEFAULT_MULTICAST_LOOP;
752
753 m->m_flags |= M_MCAST;
754 /*
755 * IP destination address is multicast. Make sure "dst"
756 * still points to the address in "ro". (It may have been
757 * changed to point to a gateway address, above.)
758 */
759 dst = (struct sockaddr_in *)&ro->ro_dst;
760 /*
761 * See if the caller provided any multicast options
762 */
763 if (imo != NULL) {
764 IMO_LOCK(imo);
765 vif = imo->imo_multicast_vif;
766 ttl = imo->imo_multicast_ttl;
767 loop = imo->imo_multicast_loop;
768 if ((flags & IP_RAWOUTPUT) == 0)
769 ip->ip_ttl = ttl;
770 if (imo->imo_multicast_ifp != NULL)
771 ifp = imo->imo_multicast_ifp;
772 IMO_UNLOCK(imo);
773 #if MROUTING
774 if (vif != -1 && ((flags & IP_RAWOUTPUT) == 0 ||
775 ip->ip_src.s_addr == INADDR_ANY))
776 ip->ip_src.s_addr = ip_mcast_src(vif);
777 #endif /* MROUTING */
778 } else if ((flags & IP_RAWOUTPUT) == 0) {
779 vif = -1;
780 ip->ip_ttl = ttl;
781 }
782 /*
783 * Confirm that the outgoing interface supports multicast.
784 */
785 if (imo == NULL || vif == -1) {
786 if ((ifp->if_flags & IFF_MULTICAST) == 0) {
787 OSAddAtomic(1, &ipstat.ips_noroute);
788 error = ENETUNREACH;
789 goto bad;
790 }
791 }
792 /*
793 * If source address not specified yet, use address
794 * of outgoing interface.
795 */
796 if (ip->ip_src.s_addr == INADDR_ANY) {
797 struct in_ifaddr *ia1;
798 lck_rw_lock_shared(in_ifaddr_rwlock);
799 TAILQ_FOREACH(ia1, &in_ifaddrhead, ia_link) {
800 IFA_LOCK_SPIN(&ia1->ia_ifa);
801 if (ia1->ia_ifp == ifp) {
802 ip->ip_src = IA_SIN(ia1)->sin_addr;
803 IFA_UNLOCK(&ia1->ia_ifa);
804 break;
805 }
806 IFA_UNLOCK(&ia1->ia_ifa);
807 }
808 lck_rw_done(in_ifaddr_rwlock);
809 if (ip->ip_src.s_addr == INADDR_ANY) {
810 error = ENETUNREACH;
811 goto bad;
812 }
813 }
814
815 in_multihead_lock_shared();
816 IN_LOOKUP_MULTI(&pkt_dst, ifp, inm);
817 in_multihead_lock_done();
818 if (inm != NULL && (imo == NULL || loop)) {
819 /*
820 * If we belong to the destination multicast group
821 * on the outgoing interface, and the caller did not
822 * forbid loopback, loop back a copy.
823 */
824 if (!TAILQ_EMPTY(&ipv4_filters)) {
825 struct ipfilter *filter;
826 int seen = (inject_filter_ref == 0);
827
828 if (imo != NULL) {
829 ipf_pktopts.ippo_flags |= IPPOF_MCAST_OPTS;
830 ipf_pktopts.ippo_mcast_ifnet = ifp;
831 ipf_pktopts.ippo_mcast_ttl = ttl;
832 ipf_pktopts.ippo_mcast_loop = loop;
833 }
834
835 ipf_ref();
836
837 /* 4135317 - always pass network byte order to filter */
838
839 #if BYTE_ORDER != BIG_ENDIAN
840 HTONS(ip->ip_len);
841 HTONS(ip->ip_off);
842 #endif
843
844 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
845 if (seen == 0) {
846 if ((struct ipfilter *)inject_filter_ref == filter)
847 seen = 1;
848 } else if (filter->ipf_filter.ipf_output) {
849 errno_t result;
850 result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo);
851 if (result == EJUSTRETURN) {
852 ipf_unref();
853 INM_REMREF(inm);
854 goto done;
855 }
856 if (result != 0) {
857 ipf_unref();
858 INM_REMREF(inm);
859 goto bad;
860 }
861 }
862 }
863
864 /* set back to host byte order */
865 ip = mtod(m, struct ip *);
866
867 #if BYTE_ORDER != BIG_ENDIAN
868 NTOHS(ip->ip_len);
869 NTOHS(ip->ip_off);
870 #endif
871
872 ipf_unref();
873 didfilter = 1;
874 }
875 ip_mloopback(ifp, m, dst, hlen);
876 }
877 #if MROUTING
878 else {
879 /*
880 * If we are acting as a multicast router, perform
881 * multicast forwarding as if the packet had just
882 * arrived on the interface to which we are about
883 * to send. The multicast forwarding function
884 * recursively calls this function, using the
885 * IP_FORWARDING flag to prevent infinite recursion.
886 *
887 * Multicasts that are looped back by ip_mloopback(),
888 * above, will be forwarded by the ip_input() routine,
889 * if necessary.
890 */
891 if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
892 /*
893 * Check if rsvp daemon is running. If not, don't
894 * set ip_moptions. This ensures that the packet
895 * is multicast and not just sent down one link
896 * as prescribed by rsvpd.
897 */
898 if (!rsvp_on)
899 imo = NULL;
900 if (ip_mforward(ip, ifp, m, imo) != 0) {
901 m_freem(m);
902 if (inm != NULL)
903 INM_REMREF(inm);
904 goto done;
905 }
906 }
907 }
908 #endif /* MROUTING */
909 if (inm != NULL)
910 INM_REMREF(inm);
911 /*
912 * Multicasts with a time-to-live of zero may be looped-
913 * back, above, but must not be transmitted on a network.
914 * Also, multicasts addressed to the loopback interface
915 * are not sent -- the above call to ip_mloopback() will
916 * loop back a copy if this host actually belongs to the
917 * destination group on the loopback interface.
918 */
919 if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
920 m_freem(m);
921 goto done;
922 }
923
924 goto sendit;
925 }
926 #ifndef notdef
927 /*
928 * If source address not specified yet, use address
929 * of outgoing interface.
930 */
931 if (ip->ip_src.s_addr == INADDR_ANY) {
932 IFA_LOCK_SPIN(&ia->ia_ifa);
933 ip->ip_src = IA_SIN(ia)->sin_addr;
934 IFA_UNLOCK(&ia->ia_ifa);
935 #if IPFIREWALL_FORWARD
936 /* Keep note that we did this - if the firewall changes
937 * the next-hop, our interface may change, changing the
938 * default source IP. It's a shame so much effort happens
939 * twice. Oh well.
940 */
941 fwd_rewrite_src++;
942 #endif /* IPFIREWALL_FORWARD */
943 }
944 #endif /* notdef */
945
946 /*
947 * Look for broadcast address and
948 * verify the user is allowed to send
949 * such a packet.
950 */
951 if (isbroadcast) {
952 if ((ifp->if_flags & IFF_BROADCAST) == 0) {
953 error = EADDRNOTAVAIL;
954 goto bad;
955 }
956 if ((flags & IP_ALLOWBROADCAST) == 0) {
957 error = EACCES;
958 goto bad;
959 }
960 /* don't allow broadcast messages to be fragmented */
961 if ((u_short)ip->ip_len > ifp->if_mtu) {
962 error = EMSGSIZE;
963 goto bad;
964 }
965 m->m_flags |= M_BCAST;
966 } else {
967 m->m_flags &= ~M_BCAST;
968 }
969
970 sendit:
971 #if PF
972 /* Invoke outbound packet filter */
973 if ( PF_IS_ENABLED) {
974 int rc;
975 rc = pf_af_hook(ifp, mppn, &m, AF_INET, FALSE);
976 if (rc != 0) {
977 if (packetlist == m0) {
978 packetlist = m;
979 mppn = NULL;
980 }
981 if (m != NULL) {
982 m0 = m;
983 /* Next packet in the chain */
984 goto loopit;
985 } else if (packetlist != NULL) {
986 /* No more packets; send down the chain */
987 goto sendchain;
988 }
989 /* Nothing left; we're done */
990 goto done;
991 }
992 m0 = m;
993 ip = mtod(m, struct ip *);
994 pkt_dst = ip->ip_dst;
995 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
996 }
997 #endif /* PF */
998 /*
999 * Force IP TTL to 255 following draft-ietf-zeroconf-ipv4-linklocal.txt
1000 */
1001 if (IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)) || IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) {
1002 ip_linklocal_stat.iplls_out_total++;
1003 if (ip->ip_ttl != MAXTTL) {
1004 ip_linklocal_stat.iplls_out_badttl++;
1005 ip->ip_ttl = MAXTTL;
1006 }
1007 }
1008
1009 if (!didfilter && !TAILQ_EMPTY(&ipv4_filters)) {
1010 struct ipfilter *filter;
1011 int seen = (inject_filter_ref == 0);
1012 ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS;
1013
1014 /* Check that a TSO frame isn't passed to a filter.
1015 * This could happen if a filter is inserted while
1016 * TCP is sending the TSO packet.
1017 */
1018 if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) {
1019 error = EMSGSIZE;
1020 goto bad;
1021 }
1022
1023 ipf_ref();
1024
1025 /* 4135317 - always pass network byte order to filter */
1026
1027 #if BYTE_ORDER != BIG_ENDIAN
1028 HTONS(ip->ip_len);
1029 HTONS(ip->ip_off);
1030 #endif
1031
1032 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
1033 if (seen == 0) {
1034 if ((struct ipfilter *)inject_filter_ref == filter)
1035 seen = 1;
1036 } else if (filter->ipf_filter.ipf_output) {
1037 errno_t result;
1038 result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo);
1039 if (result == EJUSTRETURN) {
1040 ipf_unref();
1041 goto done;
1042 }
1043 if (result != 0) {
1044 ipf_unref();
1045 goto bad;
1046 }
1047 }
1048 }
1049
1050 /* set back to host byte order */
1051 ip = mtod(m, struct ip *);
1052
1053 #if BYTE_ORDER != BIG_ENDIAN
1054 NTOHS(ip->ip_len);
1055 NTOHS(ip->ip_off);
1056 #endif
1057
1058 ipf_unref();
1059 }
1060
1061 #if IPSEC
1062 /* temporary for testing only: bypass ipsec altogether */
1063
1064 if (ipsec_bypass != 0 || (flags & IP_NOIPSEC) != 0)
1065 goto skip_ipsec;
1066
1067 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_START, 0,0,0,0,0);
1068
1069
1070 /* get SP for this packet */
1071 if (so == NULL)
1072 sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, flags, &error);
1073 else
1074 sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND, so, &error);
1075
1076 if (sp == NULL) {
1077 IPSEC_STAT_INCREMENT(ipsecstat.out_inval);
1078 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
1079 goto bad;
1080 }
1081
1082 error = 0;
1083
1084 /* check policy */
1085 switch (sp->policy) {
1086 case IPSEC_POLICY_DISCARD:
1087 case IPSEC_POLICY_GENERATE:
1088 /*
1089 * This packet is just discarded.
1090 */
1091 IPSEC_STAT_INCREMENT(ipsecstat.out_polvio);
1092 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 1,0,0,0,0);
1093 goto bad;
1094
1095 case IPSEC_POLICY_BYPASS:
1096 case IPSEC_POLICY_NONE:
1097 /* no need to do IPsec. */
1098 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 2,0,0,0,0);
1099 goto skip_ipsec;
1100
1101 case IPSEC_POLICY_IPSEC:
1102 if (sp->req == NULL) {
1103 /* acquire a policy */
1104 error = key_spdacquire(sp);
1105 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 3,0,0,0,0);
1106 goto bad;
1107 }
1108 break;
1109
1110 case IPSEC_POLICY_ENTRUST:
1111 default:
1112 printf("ip_output: Invalid policy found. %d\n", sp->policy);
1113 }
1114 {
1115 struct ipsec_output_state state;
1116 bzero(&state, sizeof(state));
1117 state.m = m;
1118 if (flags & IP_ROUTETOIF) {
1119 state.ro = &iproute;
1120 bzero(&iproute, sizeof(iproute));
1121 } else
1122 state.ro = ro;
1123 state.dst = (struct sockaddr *)dst;
1124
1125 ip->ip_sum = 0;
1126
1127 /*
1128 * XXX
1129 * delayed checksums are not currently compatible with IPsec
1130 */
1131 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1132 in_delayed_cksum(m);
1133 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1134 }
1135
1136
1137 #if BYTE_ORDER != BIG_ENDIAN
1138 HTONS(ip->ip_len);
1139 HTONS(ip->ip_off);
1140 #endif
1141
1142 DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
1143 struct ip *, ip, struct ifnet *, ifp,
1144 struct ip *, ip, struct ip6_hdr *, NULL);
1145
1146 error = ipsec4_output(&state, sp, flags);
1147
1148 m0 = m = state.m;
1149
1150 if (flags & IP_ROUTETOIF) {
1151 /*
1152 * if we have tunnel mode SA, we may need to ignore
1153 * IP_ROUTETOIF.
1154 */
1155 if (state.ro != &iproute || state.ro->ro_rt != NULL) {
1156 flags &= ~IP_ROUTETOIF;
1157 ro = state.ro;
1158 }
1159 } else
1160 ro = state.ro;
1161
1162 dst = (struct sockaddr_in *)state.dst;
1163 if (error) {
1164 /* mbuf is already reclaimed in ipsec4_output. */
1165 m0 = NULL;
1166 switch (error) {
1167 case EHOSTUNREACH:
1168 case ENETUNREACH:
1169 case EMSGSIZE:
1170 case ENOBUFS:
1171 case ENOMEM:
1172 break;
1173 default:
1174 printf("ip4_output (ipsec): error code %d\n", error);
1175 /*fall through*/
1176 case ENOENT:
1177 /* don't show these error codes to the user */
1178 error = 0;
1179 break;
1180 }
1181 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 4,0,0,0,0);
1182 goto bad;
1183 }
1184 }
1185
1186 /* be sure to update variables that are affected by ipsec4_output() */
1187 ip = mtod(m, struct ip *);
1188
1189 #ifdef _IP_VHL
1190 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1191 #else
1192 hlen = ip->ip_hl << 2;
1193 #endif
1194 /* Check that there wasn't a route change and src is still valid */
1195 if (ro->ro_rt != NULL && ro->ro_rt->generation_id != route_generation) {
1196 if ((src_ia = ifa_foraddr(ip->ip_src.s_addr)) == NULL &&
1197 ((flags & (IP_ROUTETOIF | IP_FORWARDING)) == 0)) {
1198 error = EADDRNOTAVAIL;
1199 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1200 5,0,0,0,0);
1201 goto bad;
1202 }
1203 rtfree(ro->ro_rt);
1204 ro->ro_rt = NULL;
1205 if (src_ia != NULL)
1206 IFA_REMREF(&src_ia->ia_ifa);
1207 }
1208
1209 if (ro->ro_rt == NULL) {
1210 if ((flags & IP_ROUTETOIF) == 0) {
1211 printf("ip_output: can't update route after "
1212 "IPsec processing\n");
1213 error = EHOSTUNREACH; /*XXX*/
1214 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1215 6,0,0,0,0);
1216 goto bad;
1217 }
1218 } else {
1219 if (ia)
1220 IFA_REMREF(&ia->ia_ifa);
1221 RT_LOCK_SPIN(ro->ro_rt);
1222 ia = ifatoia(ro->ro_rt->rt_ifa);
1223 if (ia) {
1224 /* Become a regular mutex */
1225 RT_CONVERT_LOCK(ro->ro_rt);
1226 IFA_ADDREF(&ia->ia_ifa);
1227 }
1228 ifp = ro->ro_rt->rt_ifp;
1229 RT_UNLOCK(ro->ro_rt);
1230 }
1231
1232 /* make it flipped, again. */
1233
1234 #if BYTE_ORDER != BIG_ENDIAN
1235 NTOHS(ip->ip_len);
1236 NTOHS(ip->ip_off);
1237 #endif
1238
1239 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 7,0xff,0xff,0xff,0xff);
1240
1241 /* Pass to filters again */
1242 if (!TAILQ_EMPTY(&ipv4_filters)) {
1243 struct ipfilter *filter;
1244
1245 ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS;
1246
1247 /* Check that a TSO frame isn't passed to a filter.
1248 * This could happen if a filter is inserted while
1249 * TCP is sending the TSO packet.
1250 */
1251 if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) {
1252 error = EMSGSIZE;
1253 goto bad;
1254 }
1255
1256 ipf_ref();
1257
1258 /* 4135317 - always pass network byte order to filter */
1259
1260 #if BYTE_ORDER != BIG_ENDIAN
1261 HTONS(ip->ip_len);
1262 HTONS(ip->ip_off);
1263 #endif
1264
1265 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
1266 if (filter->ipf_filter.ipf_output) {
1267 errno_t result;
1268 result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo);
1269 if (result == EJUSTRETURN) {
1270 ipf_unref();
1271 goto done;
1272 }
1273 if (result != 0) {
1274 ipf_unref();
1275 goto bad;
1276 }
1277 }
1278 }
1279
1280 /* set back to host byte order */
1281 ip = mtod(m, struct ip *);
1282
1283 #if BYTE_ORDER != BIG_ENDIAN
1284 NTOHS(ip->ip_len);
1285 NTOHS(ip->ip_off);
1286 #endif
1287
1288 ipf_unref();
1289 }
1290 skip_ipsec:
1291 #endif /*IPSEC*/
1292
1293 #if IPFIREWALL
1294 /*
1295 * IpHack's section.
1296 * - Xlate: translate packet's addr/port (NAT).
1297 * - Firewall: deny/allow/etc.
1298 * - Wrap: fake packet's addr/port <unimpl.>
1299 * - Encapsulate: put it in another IP and send out. <unimp.>
1300 */
1301 if (fr_checkp) {
1302 struct mbuf *m1 = m;
1303
1304 if ((error = (*fr_checkp)(ip, hlen, ifp, 1, &m1)) || !m1) {
1305 goto done;
1306 }
1307 ip = mtod(m0 = m = m1, struct ip *);
1308 }
1309
1310 /*
1311 * Check with the firewall...
1312 * but not if we are already being fwd'd from a firewall.
1313 */
1314 if (fw_enable && IPFW_LOADED && !args.next_hop) {
1315 struct sockaddr_in *old = dst;
1316
1317 args.m = m;
1318 args.next_hop = dst;
1319 args.oif = ifp;
1320 off = ip_fw_chk_ptr(&args);
1321 m = args.m;
1322 dst = args.next_hop;
1323
1324 /*
1325 * On return we must do the following:
1326 * IP_FW_PORT_DENY_FLAG -> drop the pkt (XXX new)
1327 * 1<=off<= 0xffff -> DIVERT
1328 * (off & IP_FW_PORT_DYNT_FLAG) -> send to a DUMMYNET pipe
1329 * (off & IP_FW_PORT_TEE_FLAG) -> TEE the packet
1330 * dst != old -> IPFIREWALL_FORWARD
1331 * off==0, dst==old -> accept
1332 * If some of the above modules are not compiled in, then
1333 * we shouldn't have to check the corresponding condition
1334 * (because the ipfw control socket should not accept
1335 * unsupported rules), but better play safe and drop
1336 * packets in case of doubt.
1337 */
1338 m0 = m;
1339 if ( (off & IP_FW_PORT_DENY_FLAG) || m == NULL) {
1340 if (m)
1341 m_freem(m);
1342 error = EACCES ;
1343 goto done ;
1344 }
1345 ip = mtod(m, struct ip *);
1346
1347 if (off == 0 && dst == old) {/* common case */
1348 goto pass ;
1349 }
1350 #if DUMMYNET
1351 if (DUMMYNET_LOADED && (off & IP_FW_PORT_DYNT_FLAG) != 0) {
1352 /*
1353 * pass the pkt to dummynet. Need to include
1354 * pipe number, m, ifp, ro, dst because these are
1355 * not recomputed in the next pass.
1356 * All other parameters have been already used and
1357 * so they are not needed anymore.
1358 * XXX note: if the ifp or ro entry are deleted
1359 * while a pkt is in dummynet, we are in trouble!
1360 */
1361 args.ro = ro;
1362 args.dst = dst;
1363 args.flags = flags;
1364 if (flags & IP_OUTARGS)
1365 args.ipoa = ipoa;
1366
1367 error = ip_dn_io_ptr(m, off & 0xffff, DN_TO_IP_OUT,
1368 &args);
1369 goto done;
1370 }
1371 #endif /* DUMMYNET */
1372 #if IPDIVERT
1373 if (off != 0 && (off & IP_FW_PORT_DYNT_FLAG) == 0) {
1374 struct mbuf *clone = NULL;
1375
1376 /* Clone packet if we're doing a 'tee' */
1377 if ((off & IP_FW_PORT_TEE_FLAG) != 0)
1378 clone = m_dup(m, M_DONTWAIT);
1379 /*
1380 * XXX
1381 * delayed checksums are not currently compatible
1382 * with divert sockets.
1383 */
1384 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1385 in_delayed_cksum(m);
1386 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1387 }
1388
1389 /* Restore packet header fields to original values */
1390
1391 #if BYTE_ORDER != BIG_ENDIAN
1392 HTONS(ip->ip_len);
1393 HTONS(ip->ip_off);
1394 #endif
1395
1396 /* Deliver packet to divert input routine */
1397 divert_packet(m, 0, off & 0xffff, args.divert_rule);
1398
1399 /* If 'tee', continue with original packet */
1400 if (clone != NULL) {
1401 m0 = m = clone;
1402 ip = mtod(m, struct ip *);
1403 goto pass;
1404 }
1405 goto done;
1406 }
1407 #endif
1408
1409 #if IPFIREWALL_FORWARD
1410 /* Here we check dst to make sure it's directly reachable on the
1411 * interface we previously thought it was.
1412 * If it isn't (which may be likely in some situations) we have
1413 * to re-route it (ie, find a route for the next-hop and the
1414 * associated interface) and set them here. This is nested
1415 * forwarding which in most cases is undesirable, except where
1416 * such control is nigh impossible. So we do it here.
1417 * And I'm babbling.
1418 */
1419 if (off == 0 && old != dst) {
1420 struct in_ifaddr *ia_fw;
1421
1422 /* It's changed... */
1423 /* There must be a better way to do this next line... */
1424 static struct route sro_fwd, *ro_fwd = &sro_fwd;
1425 #if IPFIREWALL_FORWARD_DEBUG
1426 printf("IPFIREWALL_FORWARD: New dst ip: ");
1427 print_ip(dst->sin_addr);
1428 printf("\n");
1429 #endif
1430 /*
1431 * We need to figure out if we have been forwarded
1432 * to a local socket. If so then we should somehow
1433 * "loop back" to ip_input, and get directed to the
1434 * PCB as if we had received this packet. This is
1435 * because it may be difficult to identify the packets
1436 * you want to forward until they are being output
1437 * and have selected an interface (e.g. locally
1438 * initiated packets). If we used the loopback interface,
1439 * we would not be able to control what happens
1440 * as the packet runs through ip_input(), as
1441 * it is done through an ISR.
1442 */
1443 lck_rw_lock_shared(in_ifaddr_rwlock);
1444 TAILQ_FOREACH(ia_fw, &in_ifaddrhead, ia_link) {
1445 /*
1446 * If the addr to forward to is one
1447 * of ours, we pretend to
1448 * be the destination for this packet.
1449 */
1450 IFA_LOCK_SPIN(&ia_fw->ia_ifa);
1451 if (IA_SIN(ia_fw)->sin_addr.s_addr ==
1452 dst->sin_addr.s_addr) {
1453 IFA_UNLOCK(&ia_fw->ia_ifa);
1454 break;
1455 }
1456 IFA_UNLOCK(&ia_fw->ia_ifa);
1457 }
1458 lck_rw_done(in_ifaddr_rwlock);
1459 if (ia_fw) {
1460 /* tell ip_input "don't filter" */
1461 struct m_tag *fwd_tag;
1462 struct ip_fwd_tag *ipfwd_tag;
1463
1464 fwd_tag = m_tag_create(KERNEL_MODULE_TAG_ID,
1465 KERNEL_TAG_TYPE_IPFORWARD,
1466 sizeof (*ipfwd_tag), M_NOWAIT, m);
1467 if (fwd_tag == NULL) {
1468 error = ENOBUFS;
1469 goto bad;
1470 }
1471
1472 ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag+1);
1473 ipfwd_tag->next_hop = args.next_hop;
1474
1475 m_tag_prepend(m, fwd_tag);
1476
1477 if (m->m_pkthdr.rcvif == NULL)
1478 m->m_pkthdr.rcvif = lo_ifp;
1479 if ((~IF_HWASSIST_CSUM_FLAGS(m->m_pkthdr.rcvif->if_hwassist) &
1480 m->m_pkthdr.csum_flags) == 0) {
1481 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1482 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1483 m->m_pkthdr.csum_flags |=
1484 CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
1485 m->m_pkthdr.csum_data = 0xffff;
1486 }
1487 m->m_pkthdr.csum_flags |=
1488 CSUM_IP_CHECKED | CSUM_IP_VALID;
1489 }
1490 else if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1491 in_delayed_cksum(m);
1492 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1493 ip->ip_sum = in_cksum(m, hlen);
1494 }
1495
1496 #if BYTE_ORDER != BIG_ENDIAN
1497 HTONS(ip->ip_len);
1498 HTONS(ip->ip_off);
1499 #endif
1500
1501 /* we need to call dlil_output to run filters
1502 * and resync to avoid recursion loops.
1503 */
1504 if (lo_ifp) {
1505 dlil_output(lo_ifp, PF_INET, m, 0, (struct sockaddr *)dst, 0);
1506 }
1507 else {
1508 printf("ip_output: no loopback ifp for forwarding!!!\n");
1509 }
1510 goto done;
1511 }
1512 /* Some of the logic for this was
1513 * nicked from above.
1514 *
1515 * This rewrites the cached route in a local PCB.
1516 * Is this what we want to do?
1517 */
1518 bcopy(dst, &ro_fwd->ro_dst, sizeof(*dst));
1519
1520 ro_fwd->ro_rt = NULL;
1521 rtalloc_ign(ro_fwd, RTF_PRCLONING);
1522
1523 if (ro_fwd->ro_rt == NULL) {
1524 OSAddAtomic(1, &ipstat.ips_noroute);
1525 error = EHOSTUNREACH;
1526 goto bad;
1527 }
1528
1529 RT_LOCK_SPIN(ro_fwd->ro_rt);
1530 ia_fw = ifatoia(ro_fwd->ro_rt->rt_ifa);
1531 if (ia_fw != NULL) {
1532 /* Become a regular mutex */
1533 RT_CONVERT_LOCK(ro_fwd->ro_rt);
1534 IFA_ADDREF(&ia_fw->ia_ifa);
1535 }
1536 ifp = ro_fwd->ro_rt->rt_ifp;
1537 ro_fwd->ro_rt->rt_use++;
1538 if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY)
1539 dst = (struct sockaddr_in *)ro_fwd->ro_rt->rt_gateway;
1540 if (ro_fwd->ro_rt->rt_flags & RTF_HOST) {
1541 isbroadcast =
1542 (ro_fwd->ro_rt->rt_flags & RTF_BROADCAST);
1543 } else {
1544 /* Become a regular mutex */
1545 RT_CONVERT_LOCK(ro_fwd->ro_rt);
1546 isbroadcast = in_broadcast(dst->sin_addr, ifp);
1547 }
1548 RT_UNLOCK(ro_fwd->ro_rt);
1549 rtfree(ro->ro_rt);
1550 ro->ro_rt = ro_fwd->ro_rt;
1551 dst = (struct sockaddr_in *)&ro_fwd->ro_dst;
1552
1553 /*
1554 * If we added a default src IP earlier,
1555 * which would have been taken from the interface
1556 * selected at that time, do it again using the new one.
1557 */
1558 if (ia_fw != NULL) {
1559 if (fwd_rewrite_src) {
1560 IFA_LOCK_SPIN(&ia_fw->ia_ifa);
1561 ip->ip_src = IA_SIN(ia_fw)->sin_addr;
1562 IFA_UNLOCK(&ia_fw->ia_ifa);
1563 }
1564 IFA_REMREF(&ia_fw->ia_ifa);
1565 }
1566 goto pass ;
1567 }
1568 #endif /* IPFIREWALL_FORWARD */
1569 /*
1570 * if we get here, none of the above matches, and
1571 * we have to drop the pkt
1572 */
1573 m_freem(m);
1574 error = EACCES; /* not sure this is the right error msg */
1575 goto done;
1576 }
1577
1578 pass:
1579 #endif /* IPFIREWALL */
1580 #if __APPLE__
1581 /* Do not allow loopback address to wind up on a wire */
1582 if ((ifp->if_flags & IFF_LOOPBACK) == 0 &&
1583 ((ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
1584 (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)) {
1585 OSAddAtomic(1, &ipstat.ips_badaddr);
1586 m_freem(m);
1587 /*
1588 * Do not simply drop the packet just like a firewall -- we want
1589 * the application to feel the pain.
1590 * Return ENETUNREACH like ip6_output does in some similar cases.
1591 * This can startle the otherwise clueless process that specifies
1592 * loopback as the source address.
1593 */
1594 error = ENETUNREACH;
1595 goto done;
1596 }
1597 #endif
1598 m->m_pkthdr.csum_flags |= CSUM_IP;
1599 tso = (ifp->if_hwassist & IFNET_TSO_IPV4) && (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4);
1600
1601 sw_csum = m->m_pkthdr.csum_flags
1602 & ~IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist);
1603
1604 if ((ifp->if_hwassist & CSUM_TCP_SUM16) != 0) {
1605 /*
1606 * Special case code for GMACE
1607 * frames that can be checksummed by the GMACE SUM16 HW:
1608 * frame > 64 bytes, no fragments, no UDP
1609 */
1610 if (apple_hwcksum_tx && (m->m_pkthdr.csum_flags & CSUM_TCP)
1611 && (ip->ip_len > 50) && (ip->ip_len <= ifp->if_mtu)) {
1612 /* Apple GMAC HW, expects STUFF_OFFSET << 16 | START_OFFSET */
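/* e.g. a plain 20-byte IP header over Ethernet gives START_OFFSET = 20 + 14 = 34;
 * with the TCP checksum field at transport offset 16 (the value TCP leaves in
 * csum_data), csum_data becomes ((16 + 34) << 16) | 34 */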
1613 u_short offset = (IP_VHL_HL(ip->ip_vhl) << 2) +14 ; /* IP+Enet header length */
1614 u_short csumprev= m->m_pkthdr.csum_data & 0xFFFF;
1615 m->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_TCP_SUM16; /* for GMAC */
1616 m->m_pkthdr.csum_data = (csumprev + offset) << 16 ;
1617 m->m_pkthdr.csum_data += offset;
1618 sw_csum = CSUM_DELAY_IP; /* do IP hdr chksum in software */
1619 }
1620 else {
1621 /* let the software handle any UDP or TCP checksums */
1622 sw_csum |= (CSUM_DELAY_DATA & m->m_pkthdr.csum_flags);
1623 }
1624 } else if (apple_hwcksum_tx == 0) {
1625 sw_csum |= (CSUM_DELAY_DATA | CSUM_DELAY_IP) &
1626 m->m_pkthdr.csum_flags;
1627 }
1628
1629 if (sw_csum & CSUM_DELAY_DATA) {
1630 in_delayed_cksum(m);
1631 sw_csum &= ~CSUM_DELAY_DATA;
1632 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1633 }
1634
1635 if (apple_hwcksum_tx != 0) {
1636 m->m_pkthdr.csum_flags &=
1637 IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist);
1638 } else {
1639 m->m_pkthdr.csum_flags = 0;
1640 }
1641
1642 /*
1643 * If small enough for interface, or the interface will take
1644 * care of the fragmentation for us, can just send directly.
1645 */
1646 if ((u_short)ip->ip_len <= ifp->if_mtu || tso ||
1647 ifp->if_hwassist & CSUM_FRAGMENT) {
1648 if (tso)
1649 m->m_pkthdr.csum_flags |= CSUM_TSO_IPV4;
1650
1651
1652 #if BYTE_ORDER != BIG_ENDIAN
1653 HTONS(ip->ip_len);
1654 HTONS(ip->ip_off);
1655 #endif
1656
1657 ip->ip_sum = 0;
1658 if (sw_csum & CSUM_DELAY_IP) {
1659 ip->ip_sum = in_cksum(m, hlen);
1660 }
1661
1662 #ifndef __APPLE__
1663 /* Record statistics for this interface address. */
1664 if (!(flags & IP_FORWARDING) && ia != NULL) {
1665 ia->ia_ifa.if_opackets++;
1666 ia->ia_ifa.if_obytes += m->m_pkthdr.len;
1667 }
1668 #endif
1669
1670 #if IPSEC
1671 /* clean ipsec history once it goes out of the node */
1672 if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0)
1673 ipsec_delaux(m);
1674 #endif
1675 if (packetchain == 0) {
1676 if (ro->ro_rt && nstat_collect)
1677 nstat_route_tx(ro->ro_rt, 1, m->m_pkthdr.len, 0);
1678 error = ifnet_output(ifp, PF_INET, m, ro->ro_rt,
1679 (struct sockaddr *)dst);
1680 goto done;
1681 }
1682 else { /* packet chaining allows us to reuse the route for all packets */
1683 bytecnt += m->m_pkthdr.len;
1684 mppn = &m->m_nextpkt;
1685 m = m->m_nextpkt;
1686 if (m == NULL) {
1687 #if PF
1688 sendchain:
1689 #endif /* PF */
1690 if (pktcnt > ip_maxchainsent)
1691 ip_maxchainsent = pktcnt;
1692 if (ro->ro_rt && nstat_collect)
1693 nstat_route_tx(ro->ro_rt, pktcnt, bytecnt, 0);
1694 //send
1695 error = ifnet_output(ifp, PF_INET, packetlist,
1696 ro->ro_rt, (struct sockaddr *)dst);
1697 pktcnt = 0;
1698 bytecnt = 0;
1699 goto done;
1700
1701 }
1702 m0 = m;
1703 pktcnt++;
1704 goto loopit;
1705 }
1706 }
1707 /*
1708 * Too large for interface; fragment if possible.
1709 * Must be able to put at least 8 bytes per fragment.
1710 */
1711
1712 if (ip->ip_off & IP_DF || (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) ||
1713 pktcnt > 0) {
1714 error = EMSGSIZE;
1715 /*
1716 * This case can happen if the user changed the MTU
1717 * of an interface after enabling IP on it. Because
1718 * most netifs don't keep track of routes pointing to
1719 * them, there is no way for one to update all its
1720 * routes when the MTU is changed.
1721 */
1722 if (ro->ro_rt) {
1723 RT_LOCK_SPIN(ro->ro_rt);
1724 if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST))
1725 && !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU)
1726 && (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) {
1727 ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
1728 }
1729 RT_UNLOCK(ro->ro_rt);
1730 }
1731 if (pktcnt > 0) {
1732 m0 = packetlist;
1733 }
1734 OSAddAtomic(1, &ipstat.ips_cantfrag);
1735 goto bad;
1736 }
1737
1738 error = ip_fragment(m, ifp, ifp->if_mtu, sw_csum);
1739 if (error != 0) {
1740 m0 = m = NULL;
1741 goto bad;
1742 }
1743
1744 KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr,
1745 ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len);
1746
1747 for (m = m0; m; m = m0) {
1748 m0 = m->m_nextpkt;
1749 m->m_nextpkt = 0;
1750 #if IPSEC
1751 /* clean ipsec history once it goes out of the node */
1752 if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0)
1753 ipsec_delaux(m);
1754 #endif
1755 if (error == 0) {
1756 #ifndef __APPLE__
1757 /* Record statistics for this interface address. */
1758 if (ia != NULL) {
1759 ia->ia_ifa.if_opackets++;
1760 ia->ia_ifa.if_obytes += m->m_pkthdr.len;
1761 }
1762 #endif
1763 if ((packetchain != 0) && (pktcnt > 0))
1764 panic("ip_output: mix of packet in packetlist is wrong=%p", packetlist);
1765 if (ro->ro_rt && nstat_collect)
1766 nstat_route_tx(ro->ro_rt, 1, m->m_pkthdr.len, 0);
1767 error = ifnet_output(ifp, PF_INET, m, ro->ro_rt,
1768 (struct sockaddr *)dst);
1769 } else
1770 m_freem(m);
1771 }
1772
1773 if (error == 0)
1774 OSAddAtomic(1, &ipstat.ips_fragmented);
1775
1776 done:
1777 if (ia) {
1778 IFA_REMREF(&ia->ia_ifa);
1779 ia = NULL;
1780 }
1781 #if IPSEC
1782 if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) {
1783 if (ro == &iproute && ro->ro_rt) {
1784 rtfree(ro->ro_rt);
1785 ro->ro_rt = NULL;
1786 }
1787 if (sp != NULL) {
1788 KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
1789 printf("DP ip_output call free SP:%x\n", sp));
1790 key_freesp(sp, KEY_SADB_UNLOCKED);
1791 }
1792 }
1793 #endif /* IPSEC */
1794
1795 KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_END, error,0,0,0,0);
1796 return (error);
1797 bad:
1798 m_freem(m0);
1799 goto done;
1800 }
1801
1802 int
1803 ip_fragment(struct mbuf *m, struct ifnet *ifp, unsigned long mtu, int sw_csum)
1804 {
1805 struct ip *ip, *mhip;
1806 int len, hlen, mhlen, firstlen, off, error = 0;
1807 struct mbuf **mnext = &m->m_nextpkt, *m0;
1808 int nfrags = 1;
1809
1810 ip = mtod(m, struct ip *);
1811 #ifdef _IP_VHL
1812 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1813 #else
1814 hlen = ip->ip_hl << 2;
1815 #endif
1816
1817 firstlen = len = (mtu - hlen) &~ 7;
1818 if (len < 8) {
1819 m_freem(m);
1820 return (EMSGSIZE);
1821 }
1822
1823 /*
1824 * if the interface will not calculate checksums on
1825 * fragmented packets, then do it here.
1826 */
1827 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
1828 (ifp->if_hwassist & CSUM_IP_FRAGS) == 0) {
1829 in_delayed_cksum(m);
1830 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1831 }
1832
1833 /*
1834 * Loop through length of segment after first fragment,
1835 * make new header and copy data of each part and link onto chain.
1836 */
1837 m0 = m;
1838 mhlen = sizeof (struct ip);
1839 for (off = hlen + len; off < (u_short)ip->ip_len; off += len) {
1840 MGETHDR(m, M_DONTWAIT, MT_HEADER); /* MAC-OK */
1841 if (m == 0) {
1842 error = ENOBUFS;
1843 OSAddAtomic(1, &ipstat.ips_odropped);
1844 goto sendorfree;
1845 }
1846 m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
1847 m->m_data += max_linkhdr;
1848 mhip = mtod(m, struct ip *);
1849 *mhip = *ip;
1850 if (hlen > sizeof (struct ip)) {
1851 mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
1852 mhip->ip_vhl = IP_MAKE_VHL(IPVERSION, mhlen >> 2);
1853 }
1854 m->m_len = mhlen;
1855 mhip->ip_off = ((off - hlen) >> 3) + (ip->ip_off & ~IP_MF);
1856 if (ip->ip_off & IP_MF)
1857 mhip->ip_off |= IP_MF;
1858 if (off + len >= (u_short)ip->ip_len)
1859 len = (u_short)ip->ip_len - off;
1860 else
1861 mhip->ip_off |= IP_MF;
1862 mhip->ip_len = htons((u_short)(len + mhlen));
1863 m->m_next = m_copy(m0, off, len);
1864 if (m->m_next == 0) {
1865 (void) m_free(m);
1866 error = ENOBUFS; /* ??? */
1867 OSAddAtomic(1, &ipstat.ips_odropped);
1868 goto sendorfree;
1869 }
1870 m->m_pkthdr.len = mhlen + len;
1871 m->m_pkthdr.rcvif = 0;
1872 m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
1873 m->m_pkthdr.socket_id = m0->m_pkthdr.socket_id;
1874 #if CONFIG_MACF_NET
1875 mac_netinet_fragment(m0, m);
1876 #endif
1877
1878 #if BYTE_ORDER != BIG_ENDIAN
1879 HTONS(mhip->ip_off);
1880 #endif
1881
1882 mhip->ip_sum = 0;
1883 if (sw_csum & CSUM_DELAY_IP) {
1884 mhip->ip_sum = in_cksum(m, mhlen);
1885 }
1886 *mnext = m;
1887 mnext = &m->m_nextpkt;
1888 nfrags++;
1889 }
1890 OSAddAtomic(nfrags, &ipstat.ips_ofragments);
1891
1892 /* set first/last markers for fragment chain */
1893 m->m_flags |= M_LASTFRAG;
1894 m0->m_flags |= M_FIRSTFRAG | M_FRAG;
1895 m0->m_pkthdr.csum_data = nfrags;
1896
1897 /*
1898 * Update first fragment by trimming what's been copied out
1899 * and updating header, then send each fragment (in order).
1900 */
1901 m = m0;
1902 m_adj(m, hlen + firstlen - (u_short)ip->ip_len);
1903 m->m_pkthdr.len = hlen + firstlen;
1904 ip->ip_len = htons((u_short)m->m_pkthdr.len);
1905 ip->ip_off |= IP_MF;
1906
1907 #if BYTE_ORDER != BIG_ENDIAN
1908 HTONS(ip->ip_off);
1909 #endif
1910
1911 ip->ip_sum = 0;
1912 if (sw_csum & CSUM_DELAY_IP) {
1913 ip->ip_sum = in_cksum(m, hlen);
1914 }
1915 sendorfree:
1916 if (error)
1917 m_freem_list(m0);
1918
1919 return (error);
1920 }
1921
1922 static void
1923 ip_out_cksum_stats(int proto, u_int32_t len)
1924 {
1925 switch (proto) {
1926 case IPPROTO_TCP:
1927 tcp_out_cksum_stats(len);
1928 break;
1929 case IPPROTO_UDP:
1930 udp_out_cksum_stats(len);
1931 break;
1932 default:
1933 /* keep only TCP or UDP stats for now */
1934 break;
1935 }
1936 }
1937
1938 void
1939 in_delayed_cksum_offset(struct mbuf *m0, int ip_offset)
1940 {
1941 struct ip *ip;
1942 unsigned char buf[sizeof(struct ip)];
1943 u_short csum, offset, ip_len;
1944
1945 /* Save copy of first mbuf pointer and the ip_offset before modifying */
1946 struct mbuf *m = m0;
1947 int ip_offset_copy = ip_offset;
1948
1949 while (ip_offset >= m->m_len) {
1950 ip_offset -= m->m_len;
1951 m = m->m_next;
1952 if (m == NULL) {
1953 printf("in_delayed_cksum_withoffset failed - ip_offset wasn't in the packet\n");
1954 return;
1955 }
1956 }
1957
1958 /* Sometimes the IP header is not contiguous, yes this can happen! */
1959 if (ip_offset + sizeof(struct ip) > m->m_len) {
1960 #if DEBUG
1961 printf("delayed m_pullup, m->len: %d off: %d\n",
1962 m->m_len, ip_offset);
1963 #endif
1964 m_copydata(m, ip_offset, sizeof(struct ip), (caddr_t) buf);
1965
1966 ip = (struct ip *)buf;
1967 } else {
1968 ip = (struct ip*)(m->m_data + ip_offset);
1969 }
1970
1971 /* Gross */
1972 if (ip_offset) {
1973 m->m_len -= ip_offset;
1974 m->m_data += ip_offset;
1975 }
1976
1977 offset = IP_VHL_HL(ip->ip_vhl) << 2 ;
1978
1979 /*
1980 * We could be in the context of an IP or interface filter; in the
1981 * former case, ip_len would be in host (correct) order while for
1982 * the latter it would be in network order. Because of this, we
1983 * attempt to interpret the length field by comparing it against
1984 * the actual packet length. If the comparison fails, byte swap
1985 * the length and check again. If it still fails, then the packet
1986 * is bogus and we give up.
1987 */
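/*
 * Concrete illustration (hypothetical values): for a 1500-byte datagram
 * at ip_offset 0, ip_len reads 1500 (0x05dc) if it is already in host
 * order, but 56325 (0xdc05) if it is still in network order on a
 * little-endian machine; only one of the two readings can equal
 * m_pkthdr.len, which is how the ambiguity is resolved below.
 */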
1988 ip_len = ip->ip_len;
1989 if (ip_len != (m0->m_pkthdr.len - ip_offset_copy)) {
1990 ip_len = SWAP16(ip_len);
1991 if (ip_len != (m0->m_pkthdr.len - ip_offset_copy)) {
1992 printf("in_delayed_cksum_offset: ip_len %d (%d) "
1993 "doesn't match actual length %d\n", ip->ip_len,
1994 ip_len, (m0->m_pkthdr.len - ip_offset_copy));
1995 return;
1996 }
1997 }
1998
1999 csum = in_cksum_skip(m, ip_len, offset);
2000
2001 /* Update stats */
2002 ip_out_cksum_stats(ip->ip_p, ip_len - offset);
2003
2004 if (m0->m_pkthdr.csum_flags & CSUM_UDP && csum == 0)
2005 csum = 0xffff;
2006 offset += m0->m_pkthdr.csum_data & 0xFFFF; /* checksum offset */
2007
2008 /* Gross */
2009 if (ip_offset) {
2010 if (M_LEADINGSPACE(m) < ip_offset)
2011 panic("in_delayed_cksum_offset - chain modified!\n");
2012 m->m_len += ip_offset;
2013 m->m_data -= ip_offset;
2014 }
2015
2016 if (offset > ip_len) /* bogus offset */
2017 return;
2018
2019 /* Insert the checksum in the existing chain */
2020 if (offset + ip_offset + sizeof(u_short) > m->m_len) {
2021 char tmp[2];
2022
2023 #if DEBUG
2024 printf("delayed m_copyback, m->len: %d off: %d p: %d\n",
2025 m->m_len, offset + ip_offset, ip->ip_p);
2026 #endif
2027 *(u_short *)tmp = csum;
2028 m_copyback(m, offset + ip_offset, 2, tmp);
2029 } else
2030 *(u_short *)(m->m_data + offset + ip_offset) = csum;
2031 }
2032
2033 void
2034 in_delayed_cksum(struct mbuf *m)
2035 {
2036 in_delayed_cksum_offset(m, 0);
2037 }
2038
2039 void
2040 in_cksum_offset(struct mbuf* m, size_t ip_offset)
2041 {
2042 struct ip* ip = NULL;
2043 int hlen = 0;
2044 unsigned char buf[sizeof(struct ip)];
2045 int swapped = 0;
2046
2047 /* Save copy of first mbuf pointer and the ip_offset before modifying */
2048 struct mbuf* m0 = m;
2049 size_t ip_offset_copy = ip_offset;
2050
2051 while (ip_offset >= m->m_len) {
2052 ip_offset -= m->m_len;
2053 m = m->m_next;
2054 if (m == NULL) {
2055 printf("in_cksum_offset failed - ip_offset wasn't in the packet\n");
2056 return;
2057 }
2058 }
2059
2060 /* Sometimes the IP header is not contiguous, yes this can happen! */
2061 if (ip_offset + sizeof(struct ip) > m->m_len) {
2062
2063 #if DEBUG
2064 printf("in_cksum_offset - delayed m_pullup, m->len: %d off: %lu\n",
2065 m->m_len, ip_offset);
2066 #endif
2067 m_copydata(m, ip_offset, sizeof(struct ip), (caddr_t) buf);
2068
2069 ip = (struct ip *)buf;
2070 ip->ip_sum = 0;
2071 m_copyback(m, ip_offset + offsetof(struct ip, ip_sum), 2, (caddr_t)&ip->ip_sum);
2072 } else {
2073 ip = (struct ip*)(m->m_data + ip_offset);
2074 ip->ip_sum = 0;
2075 }
2076
2077 /* Gross */
2078 if (ip_offset) {
2079 m->m_len -= ip_offset;
2080 m->m_data += ip_offset;
2081 }
2082
2083 #ifdef _IP_VHL
2084 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
2085 #else
2086 hlen = ip->ip_hl << 2;
2087 #endif
2088 /*
2089 * We could be in the context of an IP or interface filter; in the
2090 * former case, ip_len would be in host order while for the latter
2091 * it would be in network (correct) order. Because of this, we
2092 * attempt to interpret the length field by comparing it against
2093 * the actual packet length. If the comparison fails, byte swap
2094 * the length and check again. If it still fails, then the packet
2095 * is bogus and we give up.
2096 */
2097 if (ntohs(ip->ip_len) != (m0->m_pkthdr.len - ip_offset_copy)) {
2098 ip->ip_len = SWAP16(ip->ip_len);
2099 swapped = 1;
2100 if (ntohs(ip->ip_len) != (m0->m_pkthdr.len - ip_offset_copy)) {
2101 ip->ip_len = SWAP16(ip->ip_len);
2102 printf("in_cksum_offset: ip_len %d (%d) "
2103 "doesn't match actual length %lu\n",
2104 ip->ip_len, SWAP16(ip->ip_len),
2105 (m0->m_pkthdr.len - ip_offset_copy));
2106 return;
2107 }
2108 }
2109
2110 ip->ip_sum = 0;
2111 ip->ip_sum = in_cksum(m, hlen);
2112 if (swapped)
2113 ip->ip_len = SWAP16(ip->ip_len);
2114
2115 /* Gross */
2116 if (ip_offset) {
2117 if (M_LEADINGSPACE(m) < ip_offset)
2118 panic("in_cksum_offset - chain modified!\n");
2119 m->m_len += ip_offset;
2120 m->m_data -= ip_offset;
2121 }
2122
2123 /* Insert the checksum in the existing chain if IP header not contiguous */
2124 if (ip_offset + sizeof(struct ip) > m->m_len) {
2125 char tmp[2];
2126
2127 #if DEBUG
2128 printf("in_cksum_offset m_copyback, m->len: %u off: %lu p: %d\n",
2129 m->m_len, ip_offset + offsetof(struct ip, ip_sum), ip->ip_p);
2130 #endif
2131 *(u_short *)tmp = ip->ip_sum;
2132 m_copyback(m, ip_offset + offsetof(struct ip, ip_sum), 2, tmp);
2133 }
2134 }
2135
2136 /*
2137 * Insert IP options into preformed packet.
2138 * Adjust IP destination as required for IP source routing,
2139 * as indicated by a non-zero in_addr at the start of the options.
2140 *
2141 * XXX This routine assumes that the packet has no options in place.
2142 */
2143 static struct mbuf *
2144 ip_insertoptions(m, opt, phlen)
2145 register struct mbuf *m;
2146 struct mbuf *opt;
2147 int *phlen;
2148 {
2149 register struct ipoption *p = mtod(opt, struct ipoption *);
2150 struct mbuf *n;
2151 register struct ip *ip = mtod(m, struct ip *);
2152 unsigned optlen;
2153
2154 optlen = opt->m_len - sizeof(p->ipopt_dst);
2155 if (optlen + (u_short)ip->ip_len > IP_MAXPACKET)
2156 return (m); /* XXX should fail */
2157 if (p->ipopt_dst.s_addr)
2158 ip->ip_dst = p->ipopt_dst;
2159 if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) {
2160 MGETHDR(n, M_DONTWAIT, MT_HEADER); /* MAC-OK */
2161 if (n == 0)
2162 return (m);
2163 n->m_pkthdr.rcvif = 0;
2164 #if CONFIG_MACF_NET
2165 mac_mbuf_label_copy(m, n);
2166 #endif
2167 n->m_pkthdr.len = m->m_pkthdr.len + optlen;
2168 m->m_len -= sizeof(struct ip);
2169 m->m_data += sizeof(struct ip);
2170 n->m_next = m;
2171 m = n;
2172 m->m_len = optlen + sizeof(struct ip);
2173 m->m_data += max_linkhdr;
2174 (void)memcpy(mtod(m, void *), ip, sizeof(struct ip));
2175 } else {
2176 m->m_data -= optlen;
2177 m->m_len += optlen;
2178 m->m_pkthdr.len += optlen;
2179 ovbcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
2180 }
2181 ip = mtod(m, struct ip *);
2182 bcopy(p->ipopt_list, ip + 1, optlen);
2183 *phlen = sizeof(struct ip) + optlen;
2184 ip->ip_vhl = IP_MAKE_VHL(IPVERSION, *phlen >> 2);
2185 ip->ip_len += optlen;
2186 return (m);
2187 }
2188
2189 /*
2190 * Copy options from ip to jp,
2191 * omitting those not copied during fragmentation.
2192 */
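/*
 * For instance, a loose source route (IPOPT_LSRR, copied bit set in the
 * option type) is replicated into every fragment by this routine, while
 * a record-route option (IPOPT_RR, copied bit clear) ends up only in
 * the first fragment.
 */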
2193 int
2194 ip_optcopy(ip, jp)
2195 struct ip *ip, *jp;
2196 {
2197 register u_char *cp, *dp;
2198 int opt, optlen, cnt;
2199
2200 cp = (u_char *)(ip + 1);
2201 dp = (u_char *)(jp + 1);
2202 cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip);
2203 for (; cnt > 0; cnt -= optlen, cp += optlen) {
2204 opt = cp[0];
2205 if (opt == IPOPT_EOL)
2206 break;
2207 if (opt == IPOPT_NOP) {
2208 /* Preserve for IP mcast tunnel's LSRR alignment. */
2209 *dp++ = IPOPT_NOP;
2210 optlen = 1;
2211 continue;
2212 }
2213 #if DIAGNOSTIC
2214 if (cnt < IPOPT_OLEN + sizeof(*cp))
2215 panic("malformed IPv4 option passed to ip_optcopy");
2216 #endif
2217 optlen = cp[IPOPT_OLEN];
2218 #if DIAGNOSTIC
2219 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
2220 panic("malformed IPv4 option passed to ip_optcopy");
2221 #endif
2222 /* bogus lengths should have been caught by ip_dooptions */
2223 if (optlen > cnt)
2224 optlen = cnt;
2225 if (IPOPT_COPIED(opt)) {
2226 bcopy(cp, dp, optlen);
2227 dp += optlen;
2228 }
2229 }
2230 for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++)
2231 *dp++ = IPOPT_EOL;
2232 return (optlen);
2233 }
2234
2235 /*
2236 * IP socket option processing.
2237 */
2238 int
2239 ip_ctloutput(so, sopt)
2240 struct socket *so;
2241 struct sockopt *sopt;
2242 {
2243 struct inpcb *inp = sotoinpcb(so);
2244 int error, optval;
2245
2246 error = optval = 0;
2247 if (sopt->sopt_level != IPPROTO_IP) {
2248 return (EINVAL);
2249 }
2250
2251 switch (sopt->sopt_dir) {
2252 case SOPT_SET:
2253 switch (sopt->sopt_name) {
2254 case IP_OPTIONS:
2255 #ifdef notyet
2256 case IP_RETOPTS:
2257 #endif
2258 {
2259 struct mbuf *m;
2260 if (sopt->sopt_valsize > MLEN) {
2261 error = EMSGSIZE;
2262 break;
2263 }
2264 MGET(m, sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT,
2265 MT_HEADER);
2266 if (m == 0) {
2267 error = ENOBUFS;
2268 break;
2269 }
2270 m->m_len = sopt->sopt_valsize;
2271 error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
2272 m->m_len);
2273 if (error)
2274 break;
2275
2276 return (ip_pcbopts(sopt->sopt_name, &inp->inp_options,
2277 m));
2278 }
2279
2280 case IP_TOS:
2281 case IP_TTL:
2282 case IP_RECVOPTS:
2283 case IP_RECVRETOPTS:
2284 case IP_RECVDSTADDR:
2285 case IP_RECVIF:
2286 case IP_RECVTTL:
2287 #if defined(NFAITH) && NFAITH > 0
2288 case IP_FAITH:
2289 #endif
2290 case IP_RECVPKTINFO:
2291 error = sooptcopyin(sopt, &optval, sizeof optval,
2292 sizeof optval);
2293 if (error)
2294 break;
2295
2296 switch (sopt->sopt_name) {
2297 case IP_TOS:
2298 inp->inp_ip_tos = optval;
2299 break;
2300
2301 case IP_TTL:
2302 inp->inp_ip_ttl = optval;
2303 break;
2304 #define OPTSET(bit) \
2305 if (optval) \
2306 inp->inp_flags |= bit; \
2307 else \
2308 inp->inp_flags &= ~bit;
2309
2310 case IP_RECVOPTS:
2311 OPTSET(INP_RECVOPTS);
2312 break;
2313
2314 case IP_RECVRETOPTS:
2315 OPTSET(INP_RECVRETOPTS);
2316 break;
2317
2318 case IP_RECVDSTADDR:
2319 OPTSET(INP_RECVDSTADDR);
2320 break;
2321
2322 case IP_RECVIF:
2323 OPTSET(INP_RECVIF);
2324 break;
2325
2326 case IP_RECVTTL:
2327 OPTSET(INP_RECVTTL);
2328 break;
2329
2330 #if defined(NFAITH) && NFAITH > 0
2331 case IP_FAITH:
2332 OPTSET(INP_FAITH);
2333 break;
2334 #endif
2335 case IP_RECVPKTINFO:
2336 OPTSET(INP_PKTINFO);
2337 break;
2338 }
2339 break;
2340 #undef OPTSET
2341
2342 #if CONFIG_FORCE_OUT_IFP
2343 /*
2344 * Apple private interface, similar to IP_BOUND_IF, except
2345 * that the parameter is a NULL-terminated string containing
2346 * the name of the network interface; an empty string means
2347 * unbind. Applications are encouraged to use IP_BOUND_IF
2348 * instead, as that is the current "official" API.
2349 */
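/*
 * Illustrative userland sketch (not part of this file; "s" is assumed
 * to be an AF_INET socket descriptor):
 *
 *	char ifname[IFNAMSIZ] = "en0";
 *	setsockopt(s, IPPROTO_IP, IP_FORCE_OUT_IFP, ifname, sizeof (ifname));
 *
 * Passing a zero-length value or an empty string unbinds the socket,
 * as handled below.
 */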
2350 case IP_FORCE_OUT_IFP: {
2351 char ifname[IFNAMSIZ];
2352 unsigned int ifscope;
2353
2354 /* This option is settable only for IPv4 */
2355 if (!(inp->inp_vflag & INP_IPV4)) {
2356 error = EINVAL;
2357 break;
2358 }
2359
2360 /* Verify interface name parameter is sane */
2361 if (sopt->sopt_valsize > sizeof(ifname)) {
2362 error = EINVAL;
2363 break;
2364 }
2365
2366 /* Copy the interface name */
2367 if (sopt->sopt_valsize != 0) {
2368 error = sooptcopyin(sopt, ifname,
2369 sizeof (ifname), sopt->sopt_valsize);
2370 if (error)
2371 break;
2372 }
2373
2374 if (sopt->sopt_valsize == 0 || ifname[0] == '\0') {
2375 /* Unbind this socket from any interface */
2376 ifscope = IFSCOPE_NONE;
2377 } else {
2378 ifnet_t ifp;
2379
2380 /* Verify name is NULL terminated */
2381 if (ifname[sopt->sopt_valsize - 1] != '\0') {
2382 error = EINVAL;
2383 break;
2384 }
2385
2386 /* Bail out if given bogus interface name */
2387 if (ifnet_find_by_name(ifname, &ifp) != 0) {
2388 error = ENXIO;
2389 break;
2390 }
2391
2392 /* Bind this socket to this interface */
2393 ifscope = ifp->if_index;
2394
2395 /*
2396 * Releasing won't actually free the ifp here; and since
2397 * we don't release it anywhere later, do it now.
2398 */
2399 ifnet_release(ifp);
2400 }
2401 inp_bindif(inp, ifscope);
2402 }
2403 break;
2404 #endif
2405 /*
2406 * Multicast socket options are processed by the in_mcast
2407 * module.
2408 */
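/*
 * Illustrative userland sketch for one of these options, e.g. joining a
 * group via IP_ADD_MEMBERSHIP ("s" and the group address are assumed):
 *
 *	struct ip_mreq mreq;
 *	mreq.imr_multiaddr.s_addr = inet_addr("239.1.2.3");
 *	mreq.imr_interface.s_addr = htonl(INADDR_ANY);
 *	setsockopt(s, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof (mreq));
 */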
2409 case IP_MULTICAST_IF:
2410 case IP_MULTICAST_IFINDEX:
2411 case IP_MULTICAST_VIF:
2412 case IP_MULTICAST_TTL:
2413 case IP_MULTICAST_LOOP:
2414 case IP_ADD_MEMBERSHIP:
2415 case IP_DROP_MEMBERSHIP:
2416 case IP_ADD_SOURCE_MEMBERSHIP:
2417 case IP_DROP_SOURCE_MEMBERSHIP:
2418 case IP_BLOCK_SOURCE:
2419 case IP_UNBLOCK_SOURCE:
2420 case IP_MSFILTER:
2421 case MCAST_JOIN_GROUP:
2422 case MCAST_LEAVE_GROUP:
2423 case MCAST_JOIN_SOURCE_GROUP:
2424 case MCAST_LEAVE_SOURCE_GROUP:
2425 case MCAST_BLOCK_SOURCE:
2426 case MCAST_UNBLOCK_SOURCE:
2427 error = inp_setmoptions(inp, sopt);
2428 break;
2429
2430 case IP_PORTRANGE:
2431 error = sooptcopyin(sopt, &optval, sizeof optval,
2432 sizeof optval);
2433 if (error)
2434 break;
2435
2436 switch (optval) {
2437 case IP_PORTRANGE_DEFAULT:
2438 inp->inp_flags &= ~(INP_LOWPORT);
2439 inp->inp_flags &= ~(INP_HIGHPORT);
2440 break;
2441
2442 case IP_PORTRANGE_HIGH:
2443 inp->inp_flags &= ~(INP_LOWPORT);
2444 inp->inp_flags |= INP_HIGHPORT;
2445 break;
2446
2447 case IP_PORTRANGE_LOW:
2448 inp->inp_flags &= ~(INP_HIGHPORT);
2449 inp->inp_flags |= INP_LOWPORT;
2450 break;
2451
2452 default:
2453 error = EINVAL;
2454 break;
2455 }
2456 break;
2457
2458 #if IPSEC
2459 case IP_IPSEC_POLICY:
2460 {
2461 caddr_t req = NULL;
2462 size_t len = 0;
2463 int priv;
2464 struct mbuf *m;
2465 int optname;
2466
2467 if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
2468 break;
2469 if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
2470 break;
2471 priv = (proc_suser(sopt->sopt_p) == 0);
2472 if (m) {
2473 req = mtod(m, caddr_t);
2474 len = m->m_len;
2475 }
2476 optname = sopt->sopt_name;
2477 error = ipsec4_set_policy(inp, optname, req, len, priv);
2478 m_freem(m);
2479 break;
2480 }
2481 #endif /*IPSEC*/
2482
2483 #if TRAFFIC_MGT
2484 case IP_TRAFFIC_MGT_BACKGROUND:
2485 {
2486 unsigned background = 0;
2487 error = sooptcopyin(sopt, &background, sizeof(background), sizeof(background));
2488 if (error)
2489 break;
2490
2491 if (background) {
2492 socket_set_traffic_mgt_flags_locked(so,
2493 TRAFFIC_MGT_SO_BACKGROUND);
2494 } else {
2495 socket_clear_traffic_mgt_flags_locked(so,
2496 TRAFFIC_MGT_SO_BACKGROUND);
2497 }
2498
2499 break;
2500 }
2501 #endif /* TRAFFIC_MGT */
2502
2503 /*
2504 * On a multihomed system, scoped routing can be used to
2505 * restrict the source interface used for sending packets.
2506 * The socket option IP_BOUND_IF binds a particular AF_INET
2507 * socket to an interface such that data sent on the socket
2508 * is restricted to that interface. This is unlike the
2509 * SO_DONTROUTE option where the routing table is bypassed;
2510 * therefore it allows for a greater flexibility and control
2511 * over the system behavior, and does not place any restriction
2512 * on the destination address type (e.g. unicast, multicast,
2513 * or broadcast if applicable) or whether or not the host is
2514 * directly reachable. Note that in the multicast transmit
2515 * case, IP_MULTICAST_{IF,IFINDEX} takes precedence over
2516 * IP_BOUND_IF, since the former practically bypasses the
2517 * routing table; in this case, IP_BOUND_IF sets the default
2518 * interface used for sending multicast packets in the absence
2519 * of an explicit multicast transmit interface.
2520 */
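/*
 * Illustrative userland sketch ("s" and the interface name are assumed):
 *
 *	int idx = if_nametoindex("en0");
 *	setsockopt(s, IPPROTO_IP, IP_BOUND_IF, &idx, sizeof (idx));
 *
 * Passing IFSCOPE_NONE removes the binding, mirroring the empty-string
 * case of IP_FORCE_OUT_IFP above.
 */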
2521 case IP_BOUND_IF:
2522 /* This option is settable only for IPv4 */
2523 if (!(inp->inp_vflag & INP_IPV4)) {
2524 error = EINVAL;
2525 break;
2526 }
2527
2528 error = sooptcopyin(sopt, &optval, sizeof (optval),
2529 sizeof (optval));
2530
2531 if (error)
2532 break;
2533
2534 inp_bindif(inp, optval);
2535 break;
2536
2537 case IP_NO_IFT_CELLULAR:
2538 /* This option is settable only for IPv4 */
2539 if (!(inp->inp_vflag & INP_IPV4)) {
2540 error = EINVAL;
2541 break;
2542 }
2543
2544 error = sooptcopyin(sopt, &optval, sizeof (optval),
2545 sizeof (optval));
2546
2547 if (error)
2548 break;
2549
2550 error = inp_nocellular(inp, optval);
2551 break;
2552
2553 case IP_OUT_IF:
2554 /* This option is not settable */
2555 error = EINVAL;
2556 break;
2557
2558 default:
2559 error = ENOPROTOOPT;
2560 break;
2561 }
2562 break;
2563
2564 case SOPT_GET:
2565 switch (sopt->sopt_name) {
2566 case IP_OPTIONS:
2567 case IP_RETOPTS:
2568 if (inp->inp_options)
2569 error = sooptcopyout(sopt,
2570 mtod(inp->inp_options,
2571 char *),
2572 inp->inp_options->m_len);
2573 else
2574 sopt->sopt_valsize = 0;
2575 break;
2576
2577 case IP_TOS:
2578 case IP_TTL:
2579 case IP_RECVOPTS:
2580 case IP_RECVRETOPTS:
2581 case IP_RECVDSTADDR:
2582 case IP_RECVIF:
2583 case IP_RECVTTL:
2584 case IP_PORTRANGE:
2585 #if defined(NFAITH) && NFAITH > 0
2586 case IP_FAITH:
2587 #endif
2588 case IP_RECVPKTINFO:
2589 switch (sopt->sopt_name) {
2590
2591 case IP_TOS:
2592 optval = inp->inp_ip_tos;
2593 break;
2594
2595 case IP_TTL:
2596 optval = inp->inp_ip_ttl;
2597 break;
2598
2599 #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0)
2600
2601 case IP_RECVOPTS:
2602 optval = OPTBIT(INP_RECVOPTS);
2603 break;
2604
2605 case IP_RECVRETOPTS:
2606 optval = OPTBIT(INP_RECVRETOPTS);
2607 break;
2608
2609 case IP_RECVDSTADDR:
2610 optval = OPTBIT(INP_RECVDSTADDR);
2611 break;
2612
2613 case IP_RECVIF:
2614 optval = OPTBIT(INP_RECVIF);
2615 break;
2616
2617 case IP_RECVTTL:
2618 optval = OPTBIT(INP_RECVTTL);
2619 break;
2620
2621 case IP_PORTRANGE:
2622 if (inp->inp_flags & INP_HIGHPORT)
2623 optval = IP_PORTRANGE_HIGH;
2624 else if (inp->inp_flags & INP_LOWPORT)
2625 optval = IP_PORTRANGE_LOW;
2626 else
2627 optval = 0;
2628 break;
2629
2630 #if defined(NFAITH) && NFAITH > 0
2631 case IP_FAITH:
2632 optval = OPTBIT(INP_FAITH);
2633 break;
2634 #endif
2635 case IP_RECVPKTINFO:
2636 optval = OPTBIT(INP_PKTINFO);
2637 break;
2638 }
2639 error = sooptcopyout(sopt, &optval, sizeof optval);
2640 break;
2641
2642 case IP_MULTICAST_IF:
2643 case IP_MULTICAST_IFINDEX:
2644 case IP_MULTICAST_VIF:
2645 case IP_MULTICAST_TTL:
2646 case IP_MULTICAST_LOOP:
2647 case IP_MSFILTER:
2648 error = inp_getmoptions(inp, sopt);
2649 break;
2650
2651 #if IPSEC
2652 case IP_IPSEC_POLICY:
2653 {
2654 struct mbuf *m = NULL;
2655 caddr_t req = NULL;
2656 size_t len = 0;
2657
2658 if (m != 0) {
2659 req = mtod(m, caddr_t);
2660 len = m->m_len;
2661 }
2662 error = ipsec4_get_policy(sotoinpcb(so), req, len, &m);
2663 if (error == 0)
2664 error = soopt_mcopyout(sopt, m); /* XXX */
2665 if (error == 0)
2666 m_freem(m);
2667 break;
2668 }
2669 #endif /*IPSEC*/
2670
2671 #if TRAFFIC_MGT
2672 case IP_TRAFFIC_MGT_BACKGROUND:
2673 {
2674 unsigned background = (so->so_traffic_mgt_flags & TRAFFIC_MGT_SO_BACKGROUND);
2675 return (sooptcopyout(sopt, &background, sizeof(background)));
2676 break;
2677 }
2678 #endif /* TRAFFIC_MGT */
2679
2680 case IP_BOUND_IF:
2681 if (inp->inp_flags & INP_BOUND_IF)
2682 optval = inp->inp_boundif;
2683 error = sooptcopyout(sopt, &optval, sizeof (optval));
2684 break;
2685
2686 case IP_NO_IFT_CELLULAR:
2687 optval = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0;
2688 error = sooptcopyout(sopt, &optval, sizeof (optval));
2689 break;
2690
2691 case IP_OUT_IF:
2692 optval = inp->inp_last_outif;
2693 error = sooptcopyout(sopt, &optval, sizeof (optval));
2694 break;
2695
2696 default:
2697 error = ENOPROTOOPT;
2698 break;
2699 }
2700 break;
2701 }
2702 return (error);
2703 }
2704
2705 /*
2706 * Set up IP options in pcb for insertion in output packets.
2707 * Store in mbuf with pointer in pcbopt, adding pseudo-option
2708 * with destination address if source routed.
2709 */
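/*
 * Illustrative userland sketch of how such options reach this routine,
 * here a two-hop loose source route ("hopA"/"hopB" are assumed to be
 * struct in_addr values, "s" a socket); the leading NOP keeps the
 * total length a multiple of sizeof (int32_t), as checked below:
 *
 *	u_char opts[12];
 *	opts[0] = IPOPT_NOP;
 *	opts[1] = IPOPT_LSRR;
 *	opts[2] = 3 + 2 * sizeof (struct in_addr);
 *	opts[3] = IPOPT_MINOFF;
 *	bcopy(&hopA, &opts[4], sizeof (struct in_addr));
 *	bcopy(&hopB, &opts[8], sizeof (struct in_addr));
 *	setsockopt(s, IPPROTO_IP, IP_OPTIONS, opts, sizeof (opts));
 *
 * This routine then pulls hopA (the first hop) out of the option and
 * stores it ahead of the remaining option bytes.
 */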
2710 static int
2711 ip_pcbopts(
2712 __unused int optname,
2713 struct mbuf **pcbopt,
2714 register struct mbuf *m)
2715 {
2716 register int cnt, optlen;
2717 register u_char *cp;
2718 u_char opt;
2719
2720 /* turn off any old options */
2721 if (*pcbopt)
2722 (void)m_free(*pcbopt);
2723 *pcbopt = 0;
2724 if (m == (struct mbuf *)0 || m->m_len == 0) {
2725 /*
2726 * Only turning off any previous options.
2727 */
2728 if (m)
2729 (void)m_free(m);
2730 return (0);
2731 }
2732
2733 #ifndef vax
2734 if (m->m_len % sizeof(int32_t))
2735 goto bad;
2736 #endif
2737 /*
2738 * IP first-hop destination address will be stored before
2739 * actual options; move other options back
2740 * and clear it when none present.
2741 */
2742 if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN])
2743 goto bad;
2744 cnt = m->m_len;
2745 m->m_len += sizeof(struct in_addr);
2746 cp = mtod(m, u_char *) + sizeof(struct in_addr);
2747 ovbcopy(mtod(m, caddr_t), (caddr_t)cp, (unsigned)cnt);
2748 bzero(mtod(m, caddr_t), sizeof(struct in_addr));
2749
2750 for (; cnt > 0; cnt -= optlen, cp += optlen) {
2751 opt = cp[IPOPT_OPTVAL];
2752 if (opt == IPOPT_EOL)
2753 break;
2754 if (opt == IPOPT_NOP)
2755 optlen = 1;
2756 else {
2757 if (cnt < IPOPT_OLEN + sizeof(*cp))
2758 goto bad;
2759 optlen = cp[IPOPT_OLEN];
2760 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
2761 goto bad;
2762 }
2763 switch (opt) {
2764
2765 default:
2766 break;
2767
2768 case IPOPT_LSRR:
2769 case IPOPT_SSRR:
2770 /*
2771 * user process specifies route as:
2772 * ->A->B->C->D
2773 * D must be our final destination (but we can't
2774 * check that since we may not have connected yet).
2775 * A is first hop destination, which doesn't appear in
2776 * actual IP option, but is stored before the options.
2777 */
2778 if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr))
2779 goto bad;
2780 m->m_len -= sizeof(struct in_addr);
2781 cnt -= sizeof(struct in_addr);
2782 optlen -= sizeof(struct in_addr);
2783 cp[IPOPT_OLEN] = optlen;
2784 /*
2785 * Move first hop before start of options.
2786 */
2787 bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t),
2788 sizeof(struct in_addr));
2789 /*
2790 * Then copy rest of options back
2791 * to close up the deleted entry.
2792 */
2793 ovbcopy((caddr_t)(&cp[IPOPT_OFFSET+1] +
2794 sizeof(struct in_addr)),
2795 (caddr_t)&cp[IPOPT_OFFSET+1],
2796 (unsigned)cnt + sizeof(struct in_addr));
2797 break;
2798 }
2799 }
2800 if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr))
2801 goto bad;
2802 *pcbopt = m;
2803 return (0);
2804
2805 bad:
2806 (void)m_free(m);
2807 return (EINVAL);
2808 }
2809
2810 void
2811 ip_moptions_init(void)
2812 {
2813 PE_parse_boot_argn("ifa_debug", &imo_debug, sizeof (imo_debug));
2814
2815 imo_size = (imo_debug == 0) ? sizeof (struct ip_moptions) :
2816 sizeof (struct ip_moptions_dbg);
2817
2818 imo_zone = zinit(imo_size, IMO_ZONE_MAX * imo_size, 0,
2819 IMO_ZONE_NAME);
2820 if (imo_zone == NULL) {
2821 panic("%s: failed allocating %s", __func__, IMO_ZONE_NAME);
2822 /* NOTREACHED */
2823 }
2824 zone_change(imo_zone, Z_EXPAND, TRUE);
2825 }
2826
2827 void
2828 imo_addref(struct ip_moptions *imo, int locked)
2829 {
2830 if (!locked)
2831 IMO_LOCK(imo);
2832 else
2833 IMO_LOCK_ASSERT_HELD(imo);
2834
2835 if (++imo->imo_refcnt == 0) {
2836 panic("%s: imo %p wraparound refcnt\n", __func__, imo);
2837 /* NOTREACHED */
2838 } else if (imo->imo_trace != NULL) {
2839 (*imo->imo_trace)(imo, TRUE);
2840 }
2841
2842 if (!locked)
2843 IMO_UNLOCK(imo);
2844 }
2845
2846 void
2847 imo_remref(struct ip_moptions *imo)
2848 {
2849 int i;
2850
2851 IMO_LOCK(imo);
2852 if (imo->imo_refcnt == 0) {
2853 panic("%s: imo %p negative refcnt", __func__, imo);
2854 /* NOTREACHED */
2855 } else if (imo->imo_trace != NULL) {
2856 (*imo->imo_trace)(imo, FALSE);
2857 }
2858
2859 --imo->imo_refcnt;
2860 if (imo->imo_refcnt > 0) {
2861 IMO_UNLOCK(imo);
2862 return;
2863 }
2864
2865 for (i = 0; i < imo->imo_num_memberships; ++i) {
2866 struct in_mfilter *imf;
2867
2868 imf = imo->imo_mfilters ? &imo->imo_mfilters[i] : NULL;
2869 if (imf != NULL)
2870 imf_leave(imf);
2871
2872 (void) in_leavegroup(imo->imo_membership[i], imf);
2873
2874 if (imf != NULL)
2875 imf_purge(imf);
2876
2877 INM_REMREF(imo->imo_membership[i]);
2878 imo->imo_membership[i] = NULL;
2879 }
2880 imo->imo_num_memberships = 0;
2881 if (imo->imo_mfilters != NULL) {
2882 FREE(imo->imo_mfilters, M_INMFILTER);
2883 imo->imo_mfilters = NULL;
2884 }
2885 if (imo->imo_membership != NULL) {
2886 FREE(imo->imo_membership, M_IPMOPTS);
2887 imo->imo_membership = NULL;
2888 }
2889 IMO_UNLOCK(imo);
2890
2891 lck_mtx_destroy(&imo->imo_lock, ifa_mtx_grp);
2892
2893 if (!(imo->imo_debug & IFD_ALLOC)) {
2894 panic("%s: imo %p cannot be freed", __func__, imo);
2895 /* NOTREACHED */
2896 }
2897 zfree(imo_zone, imo);
2898 }
2899
2900 static void
2901 imo_trace(struct ip_moptions *imo, int refhold)
2902 {
2903 struct ip_moptions_dbg *imo_dbg = (struct ip_moptions_dbg *)imo;
2904 ctrace_t *tr;
2905 u_int32_t idx;
2906 u_int16_t *cnt;
2907
2908 if (!(imo->imo_debug & IFD_DEBUG)) {
2909 panic("%s: imo %p has no debug structure", __func__, imo);
2910 /* NOTREACHED */
2911 }
2912 if (refhold) {
2913 cnt = &imo_dbg->imo_refhold_cnt;
2914 tr = imo_dbg->imo_refhold;
2915 } else {
2916 cnt = &imo_dbg->imo_refrele_cnt;
2917 tr = imo_dbg->imo_refrele;
2918 }
2919
2920 idx = atomic_add_16_ov(cnt, 1) % IMO_TRACE_HIST_SIZE;
2921 ctrace_record(&tr[idx]);
2922 }
2923
2924 struct ip_moptions *
2925 ip_allocmoptions(int how)
2926 {
2927 struct ip_moptions *imo;
2928
2929 imo = (how == M_WAITOK) ? zalloc(imo_zone) : zalloc_noblock(imo_zone);
2930 if (imo != NULL) {
2931 bzero(imo, imo_size);
2932 lck_mtx_init(&imo->imo_lock, ifa_mtx_grp, ifa_mtx_attr);
2933 imo->imo_debug |= IFD_ALLOC;
2934 if (imo_debug != 0) {
2935 imo->imo_debug |= IFD_DEBUG;
2936 imo->imo_trace = imo_trace;
2937 }
2938 IMO_ADDREF(imo);
2939 }
2940
2941 return (imo);
2942 }
2943
2944 /*
2945 * Routine called from ip_output() to loop back a copy of an IP multicast
2946 * packet to the input queue of a specified interface. Note that this
2947 * calls the output routine of the loopback "driver", but with an interface
2948 * pointer that might NOT be a loopback interface -- evil, but easier than
2949 * replicating that code here.
2950 */
2951 static void
2952 ip_mloopback(ifp, m, dst, hlen)
2953 struct ifnet *ifp;
2954 register struct mbuf *m;
2955 register struct sockaddr_in *dst;
2956 int hlen;
2957 {
2958 register struct ip *ip;
2959 struct mbuf *copym;
2960 int sw_csum = (apple_hwcksum_tx == 0);
2961
2962 copym = m_copy(m, 0, M_COPYALL);
2963 if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen))
2964 copym = m_pullup(copym, hlen);
2965
2966 if (copym == NULL)
2967 return;
2968
2969 /*
2970 * We don't bother to fragment if the IP length is greater
2971 * than the interface's MTU. Can this possibly matter?
2972 */
2973 ip = mtod(copym, struct ip *);
2974
2975 #if BYTE_ORDER != BIG_ENDIAN
2976 HTONS(ip->ip_len);
2977 HTONS(ip->ip_off);
2978 #endif
2979
2980 ip->ip_sum = 0;
2981 ip->ip_sum = in_cksum(copym, hlen);
2982 /*
2983 * NB:
2984 * It's not clear whether there are any lingering
2985 * reentrancy problems in other areas which might
2986 * be exposed by using ip_input directly (in
2987 * particular, everything which modifies the packet
2988 * in-place). Yet another option is using the
2989 * protosw directly to deliver the looped back
2990 * packet. For the moment, we'll err on the side
2991 * of safety by looping it back via dlil_output().
2992 */
2993 #if 1 /* XXX */
2994 if (dst->sin_family != AF_INET) {
2995 printf("ip_mloopback: bad address family %d\n",
2996 dst->sin_family);
2997 dst->sin_family = AF_INET;
2998 }
2999 #endif
3000
3001 /*
3002 * Mark checksum as valid or calculate checksum for loopback.
3003 *
3004 * This is done this way because we have to embed the ifp of
3005 * the interface we will send the original copy of the packet
3006 * out on in the mbuf. ip_input will check if_hwassist of the
3007 * embedded ifp and ignore all csum_flags if if_hwassist is 0.
3008 * The UDP checksum has not been calculated yet.
3009 */
3010 if (sw_csum || (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA)) {
3011 if (!sw_csum && IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist)) {
3012 copym->m_pkthdr.csum_flags |=
3013 CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
3014 CSUM_IP_CHECKED | CSUM_IP_VALID;
3015 copym->m_pkthdr.csum_data = 0xffff;
3016 } else {
3017
3018 #if BYTE_ORDER != BIG_ENDIAN
3019 NTOHS(ip->ip_len);
3020 #endif
3021
3022 in_delayed_cksum(copym);
3023
3024 #if BYTE_ORDER != BIG_ENDIAN
3025 HTONS(ip->ip_len);
3026 #endif
3027
3028 }
3029 }
3030
3031 /*
3032 * TedW:
3033 * We need to send all loopback traffic down to dlil in case
3034 * a filter has tapped-in.
3035 */
3036
3037 /*
3038 * Stuff the 'real' ifp into the pkthdr, to be used in matching
3039 * in ip_input(); we need the loopback ifp/dl_tag passed as args
3040 * to make the loopback driver compliant with the data link
3041 * requirements.
3042 */
3043 if (lo_ifp) {
3044 copym->m_pkthdr.rcvif = ifp;
3045 dlil_output(lo_ifp, PF_INET, copym, 0,
3046 (struct sockaddr *) dst, 0);
3047 } else {
3048 printf("Warning: ip_output call to dlil_find_dltag failed!\n");
3049 m_freem(copym);
3050 }
3051 }
3052
3053 /*
3054 * Given a source IP address (and route, if available), determine the best
3055 * interface to send the packet from. Checking for (and updating) the
3056 * ROF_SRCIF_SELECTED flag in the pcb-supplied route placeholder is done
3057 * without any locks based on the assumption that ip_output() is single-
3058 * threaded per-pcb, i.e. for any given pcb there can only be one thread
3059 * performing output at the IP layer.
3060 *
3061 * This routine is analogous to in6_selectroute() for IPv6.
3062 */
3063 static struct ifaddr *
3064 in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope)
3065 {
3066 struct ifaddr *ifa = NULL;
3067 struct in_addr src = ip->ip_src;
3068 struct in_addr dst = ip->ip_dst;
3069 struct ifnet *rt_ifp;
3070 char s_src[MAX_IPv4_STR_LEN], s_dst[MAX_IPv4_STR_LEN];
3071
3072 if (ip_select_srcif_debug) {
3073 (void) inet_ntop(AF_INET, &src.s_addr, s_src, sizeof (s_src));
3074 (void) inet_ntop(AF_INET, &dst.s_addr, s_dst, sizeof (s_dst));
3075 }
3076
3077 if (ro->ro_rt != NULL)
3078 RT_LOCK(ro->ro_rt);
3079
3080 rt_ifp = (ro->ro_rt != NULL) ? ro->ro_rt->rt_ifp : NULL;
3081
3082 /*
3083 * Given the source IP address, find a suitable source interface
3084 * to use for transmission; if the caller has specified a scope,
3085 * optimize the search by looking at the addresses only for that
3086 * interface. This is still suboptimal, however, as we need to
3087 * traverse the per-interface list.
3088 */
3089 if (ifscope != IFSCOPE_NONE || ro->ro_rt != NULL) {
3090 unsigned int scope = ifscope;
3091
3092 /*
3093 * If no scope is specified and the route is stale (pointing
3094 * to a defunct interface) use the current primary interface;
3095 * this happens when switching between interfaces configured
3096 * with the same IP address. Otherwise pick up the scope
3097 * information from the route; the ULP may have looked up a
3098 * correct route and we just need to verify it here and mark
3099 * it with the ROF_SRCIF_SELECTED flag below.
3100 */
3101 if (scope == IFSCOPE_NONE) {
3102 scope = rt_ifp->if_index;
3103 if (scope != get_primary_ifscope(AF_INET) &&
3104 ro->ro_rt->generation_id != route_generation)
3105 scope = get_primary_ifscope(AF_INET);
3106 }
3107
3108 ifa = (struct ifaddr *)ifa_foraddr_scoped(src.s_addr, scope);
3109
3110 if (ifa == NULL && ip->ip_p != IPPROTO_UDP &&
3111 ip->ip_p != IPPROTO_TCP && ipforwarding) {
3112 /*
3113 * If forwarding is enabled, and if the packet isn't
3114 * TCP or UDP, check if the source address belongs
3115 * to one of our own interfaces; if so, demote the
3116 * interface scope and do a route lookup right below.
3117 */
3118 ifa = (struct ifaddr *)ifa_foraddr(src.s_addr);
3119 if (ifa != NULL) {
3120 IFA_REMREF(ifa);
3121 ifa = NULL;
3122 ifscope = IFSCOPE_NONE;
3123 }
3124 }
3125
3126 if (ip_select_srcif_debug && ifa != NULL) {
3127 if (ro->ro_rt != NULL) {
3128 printf("%s->%s ifscope %d->%d ifa_if %s "
3129 "ro_if %s\n", s_src, s_dst, ifscope,
3130 scope, if_name(ifa->ifa_ifp),
3131 if_name(rt_ifp));
3132 } else {
3133 printf("%s->%s ifscope %d->%d ifa_if %s\n",
3134 s_src, s_dst, ifscope, scope,
3135 if_name(ifa->ifa_ifp));
3136 }
3137 }
3138 }
3139
3140 /*
3141 * Slow path; search for an interface having the corresponding source
3142 * IP address if the scope was not specified by the caller, and:
3143 *
3144 * 1) There currently isn't any route, or,
3145 * 2) The interface used by the route does not own that source
3146 * IP address; in this case, the route will get blown away
3147 * and we'll do a more specific scoped search using the newly
3148 * found interface.
3149 */
3150 if (ifa == NULL && ifscope == IFSCOPE_NONE) {
3151 ifa = (struct ifaddr *)ifa_foraddr(src.s_addr);
3152
3153 /*
3154 * If we have the IP address, but not the route, we don't
3155 * really know whether or not it belongs to the correct
3156 * interface (it could be shared across multiple interfaces.)
3157 * The only way to find out is to do a route lookup.
3158 */
3159 if (ifa != NULL && ro->ro_rt == NULL) {
3160 struct rtentry *rt;
3161 struct sockaddr_in sin;
3162 struct ifaddr *oifa = NULL;
3163
3164 bzero(&sin, sizeof (sin));
3165 sin.sin_family = AF_INET;
3166 sin.sin_len = sizeof (sin);
3167 sin.sin_addr = dst;
3168
3169 lck_mtx_lock(rnh_lock);
3170 if ((rt = rt_lookup(TRUE, (struct sockaddr *)&sin, NULL,
3171 rt_tables[AF_INET], IFSCOPE_NONE)) != NULL) {
3172 RT_LOCK(rt);
3173 /*
3174 * If the route uses a different interface,
3175 * use that one instead. The IP address of
3176 * the ifaddr that we pick up here is not
3177 * relevant.
3178 */
3179 if (ifa->ifa_ifp != rt->rt_ifp) {
3180 oifa = ifa;
3181 ifa = rt->rt_ifa;
3182 IFA_ADDREF(ifa);
3183 RT_UNLOCK(rt);
3184 } else {
3185 RT_UNLOCK(rt);
3186 }
3187 rtfree_locked(rt);
3188 }
3189 lck_mtx_unlock(rnh_lock);
3190
3191 if (oifa != NULL) {
3192 struct ifaddr *iifa;
3193
3194 /*
3195 * See if the interface pointed to by the
3196 * route is configured with the source IP
3197 * address of the packet.
3198 */
3199 iifa = (struct ifaddr *)ifa_foraddr_scoped(
3200 src.s_addr, ifa->ifa_ifp->if_index);
3201
3202 if (iifa != NULL) {
3203 /*
3204 * Found it; drop the original one
3205 * as well as the route interface
3206 * address, and use this instead.
3207 */
3208 IFA_REMREF(oifa);
3209 IFA_REMREF(ifa);
3210 ifa = iifa;
3211 } else if (!ipforwarding ||
3212 (rt->rt_flags & RTF_GATEWAY)) {
3213 /*
3214 * This interface doesn't have that
3215 * source IP address; drop the route
3216 * interface address and just use the
3217 * original one, and let the caller
3218 * do a scoped route lookup.
3219 */
3220 IFA_REMREF(ifa);
3221 ifa = oifa;
3222 } else {
3223 /*
3224 * Forwarding is enabled and the source
3225 * address belongs to one of our own
3226 * interfaces which isn't the outgoing
3227 * interface, and we have a route, and
3228 * the destination is on a network that
3229 * is directly attached (onlink); drop
3230 * the original one and use the route
3231 * interface address instead.
3232 */
3233 IFA_REMREF(oifa);
3234 }
3235 }
3236 } else if (ifa != NULL && ro->ro_rt != NULL &&
3237 !(ro->ro_rt->rt_flags & RTF_GATEWAY) &&
3238 ifa->ifa_ifp != ro->ro_rt->rt_ifp && ipforwarding) {
3239 /*
3240 * Forwarding is enabled and the source address belongs
3241 * to one of our own interfaces which isn't the same
3242 * as the interface used by the known route; drop the
3243 * original one and use the route interface address.
3244 */
3245 IFA_REMREF(ifa);
3246 ifa = ro->ro_rt->rt_ifa;
3247 IFA_ADDREF(ifa);
3248 }
3249
3250 if (ip_select_srcif_debug && ifa != NULL) {
3251 printf("%s->%s ifscope %d ifa_if %s\n",
3252 s_src, s_dst, ifscope, if_name(ifa->ifa_ifp));
3253 }
3254 }
3255
3256 if (ro->ro_rt != NULL)
3257 RT_LOCK_ASSERT_HELD(ro->ro_rt);
3258 /*
3259 * If there is a non-loopback route with the wrong interface, or if
3260 * there is no interface configured with such an address, blow it
3261 * away. Except for local/loopback, we look for one with a matching
3262 * interface scope/index.
3263 */
3264 if (ro->ro_rt != NULL &&
3265 (ifa == NULL || (ifa->ifa_ifp != rt_ifp && rt_ifp != lo_ifp) ||
3266 !(ro->ro_rt->rt_flags & RTF_UP))) {
3267 if (ip_select_srcif_debug) {
3268 if (ifa != NULL) {
3269 printf("%s->%s ifscope %d ro_if %s != "
3270 "ifa_if %s (cached route cleared)\n",
3271 s_src, s_dst, ifscope, if_name(rt_ifp),
3272 if_name(ifa->ifa_ifp));
3273 } else {
3274 printf("%s->%s ifscope %d ro_if %s "
3275 "(no ifa_if found)\n",
3276 s_src, s_dst, ifscope, if_name(rt_ifp));
3277 }
3278 }
3279
3280 RT_UNLOCK(ro->ro_rt);
3281 rtfree(ro->ro_rt);
3282 ro->ro_rt = NULL;
3283 ro->ro_flags &= ~ROF_SRCIF_SELECTED;
3284
3285 /*
3286 * If the destination is IPv4 LLA and the route's interface
3287 * doesn't match the source interface, then the source IP
3288 * address is wrong; it most likely belongs to the primary
3289 * interface associated with the IPv4 LL subnet. Drop the
3290 * packet rather than letting it go out and return an error
3291 * to the ULP. This actually applies not only to IPv4 LL
3292 * but also to other shared subnets; for now we explicitly test
3293 * only for the former case and leave the latter for the future.
3294 */
3295 if (IN_LINKLOCAL(ntohl(dst.s_addr)) &&
3296 !IN_LINKLOCAL(ntohl(src.s_addr)) && ifa != NULL) {
3297 IFA_REMREF(ifa);
3298 ifa = NULL;
3299 }
3300 }
3301
3302 if (ip_select_srcif_debug && ifa == NULL) {
3303 printf("%s->%s ifscope %d (neither ro_if/ifa_if found)\n",
3304 s_src, s_dst, ifscope);
3305 }
3306
3307 /*
3308 * If there is a route, mark it accordingly. If there isn't one,
3309 * we'll get here again during the next transmit (possibly with a
3310 * route) and the flag will get set at that point. For IPv4 LLA
3311 * destination, mark it only if the route has been fully resolved;
3312 * otherwise we want to come back here again when the route points
3313 * to the interface on which the ARP reply arrives.
3314 */
3315 if (ro->ro_rt != NULL && (!IN_LINKLOCAL(ntohl(dst.s_addr)) ||
3316 (ro->ro_rt->rt_gateway->sa_family == AF_LINK &&
3317 SDL(ro->ro_rt->rt_gateway)->sdl_alen != 0))) {
3318 ro->ro_flags |= ROF_SRCIF_SELECTED;
3319 ro->ro_rt->generation_id = route_generation;
3320 }
3321
3322 if (ro->ro_rt != NULL)
3323 RT_UNLOCK(ro->ro_rt);
3324
3325 return (ifa);
3326 }