bsd/netinet/ip_output.c

   1 /*
   2  * Copyright (c) 2000-2011 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  30  *      The Regents of the University of California.  All rights reserved.
  31  *
  32  * Redistribution and use in source and binary forms, with or without
  33  * modification, are permitted provided that the following conditions
  34  * are met:
  35  * 1. Redistributions of source code must retain the above copyright
  36  *    notice, this list of conditions and the following disclaimer.
  37  * 2. Redistributions in binary form must reproduce the above copyright
  38  *    notice, this list of conditions and the following disclaimer in the
  39  *    documentation and/or other materials provided with the distribution.
  40  * 3. All advertising materials mentioning features or use of this software
  41  *    must display the following acknowledgement:
  42  *      This product includes software developed by the University of
  43  *      California, Berkeley and its contributors.
  44  * 4. Neither the name of the University nor the names of its contributors
  45  *    may be used to endorse or promote products derived from this software
  46  *    without specific prior written permission.
  47  *
  48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  58  * SUCH DAMAGE.
  59  *
  60  *      @(#)ip_output.c 8.3 (Berkeley) 1/21/94
  61  * $FreeBSD: src/sys/netinet/ip_output.c,v 1.99.2.16 2001/07/19 06:37:26 kris Exp $
  62  */
  63 /*
  64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  65  * support for mandatory and extensible security protections.  This notice
  66  * is included in support of clause 2.2 (b) of the Apple Public License,
  67  * Version 2.0.
  68  */
  69
  70 #define _IP_VHL
  71
  72 #include <sys/param.h>
  73 #include <sys/systm.h>
  74 #include <sys/kernel.h>
  75 #include <sys/malloc.h>
  76 #include <sys/mbuf.h>
  77 #include <sys/protosw.h>
  78 #include <sys/socket.h>
  79 #include <sys/socketvar.h>
  80 #include <kern/locks.h>
  81 #include <sys/sysctl.h>
  82 #include <sys/mcache.h>
  83
  84 #include <machine/endian.h>
  85 #include <pexpert/pexpert.h>
  86
  87 #include <net/if.h>
  88 #include <net/if_dl.h>
  89 #include <net/if_types.h>
  90 #include <net/route.h>
  91 #include <net/ntstat.h>
  92 #include <net/net_osdep.h>
  93
  94 #include <netinet/in.h>
  95 #include <netinet/in_systm.h>
  96 #include <netinet/ip.h>
  97 #include <netinet/in_pcb.h>
  98 #include <netinet/in_var.h>
  99 #include <netinet/ip_var.h>
 100
 101 #include <netinet/kpi_ipfilter_var.h>
 102
 103 #if CONFIG_MACF_NET
 104 #include <security/mac_framework.h>
 105 #endif
 106
 107 #include "faith.h"
 108
 109 #include <net/dlil.h>
 110 #include <sys/kdebug.h>
 111 #include <libkern/OSAtomic.h>
 112
  113 #define DBG_LAYER_BEG           NETDBG_CODE(DBG_NETIP, 1)   /* kdebug code; ip_output_list() traces it with the packet's dst, src, proto, off and len */
  114 #define DBG_LAYER_END           NETDBG_CODE(DBG_NETIP, 3)   /* NOTE(review): presumably the matching end-of-layer code - confirm at its use site */
  115 #define DBG_FNC_IP_OUTPUT       NETDBG_CODE(DBG_NETIP, (1 << 8) | 1)   /* traced with DBG_FUNC_START on entry to ip_output_list() */
  116 #define DBG_FNC_IPSEC4_OUTPUT   NETDBG_CODE(DBG_NETIP, (2 << 8) | 1)   /* NOTE(review): presumably brackets the ipsec4_output() call - confirm */
  117
  118 #define SWAP16(v) ((((v) & 0xff) << 8) | ((v) >> 8))   /* exchange the two bytes of v; v is evaluated twice and bits above 15 are not masked off, so pass a side-effect-free 16-bit value */
 119
 120 #if IPSEC
 121 #include <netinet6/ipsec.h>
 122 #include <netkey/key.h>
 123 #if IPSEC_DEBUG
 124 #include <netkey/key_debug.h>
 125 #else
 126 #define KEYDEBUG(lev,arg)
 127 #endif
 128 #endif /*IPSEC*/
 129
 130 #include <netinet/ip_fw.h>
 131 #include <netinet/ip_divert.h>
 132 #include <mach/sdt.h>
 133
 134 #if DUMMYNET
 135 #include <netinet/ip_dummynet.h>
 136 #endif
 137
 138 #if PF
 139 #include <net/pfvar.h>
 140 #endif /* PF */
 141
 142 #if IPFIREWALL_FORWARD_DEBUG
 143 #define print_ip(a)      printf("%ld.%ld.%ld.%ld",(ntohl(a.s_addr)>>24)&0xFF,\
 144                                                   (ntohl(a.s_addr)>>16)&0xFF,\
 145                                                   (ntohl(a.s_addr)>>8)&0xFF,\
 146                                                   (ntohl(a.s_addr))&0xFF);
 147 #endif
 148
 149
  150 u_short ip_id;          /* IP ident counter: ip_output_list() stores htons(ip_id++) in the header when RANDOM_IP_ID is not set */
  151
  152 static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);   /* called as m = ip_insertoptions(m, opt, &len); the len written back becomes hlen */
  153 static void     ip_mloopback(struct ifnet *, struct mbuf *,
  154         struct sockaddr_in *, int);
  155 static int      ip_pcbopts(int, struct mbuf **, struct mbuf *);
  156 static void     imo_trace(struct ip_moptions *, int);   /* NOTE(review): presumably fills the ctrace_t history in struct ip_moptions_dbg - confirm at its definition */
  157
  158 static void ip_out_cksum_stats(int, u_int32_t);
  159 static struct ifaddr *in_selectsrcif(struct ip *, struct route *, unsigned int);   /* called as in_selectsrcif(ip, ro, ifscope); may return NULL, otherwise the caller drops the result with IFA_REMREF() */
  160
  161 int     ip_optcopy(struct ip *, struct ip *);           /* the three prototypes below are not static, so they have external linkage */
  162 void in_delayed_cksum_offset(struct mbuf *, int );
  163 void in_cksum_offset(struct mbuf* , size_t );
  164
  165 extern int (*fr_checkp)(struct ip *, int, struct ifnet *, int, struct mbuf **);   /* function-pointer hook defined elsewhere; listed as the [firewall] error source of ip_output_list() */
  166
  167 extern  struct protosw inetsw[];
  168
  169 extern struct ip_linklocal_stat ip_linklocal_stat;
  170 extern lck_mtx_t *ip_mutex;
 171
  172 /* temporary: for testing; ip_output_list() fetches the packet's IPsec socket only when this is 0 and IP_NOIPSEC is clear */
  173 #if IPSEC
  174 extern int ipsec_bypass;
  175 #endif
  176
  177 static int      ip_maxchainsent = 0;    /* exported read/write by the SYSCTL_INT below under the name "maxchainsent" */
  178 SYSCTL_INT(_net_inet_ip, OID_AUTO, maxchainsent, CTLFLAG_RW | CTLFLAG_LOCKED,
  179     &ip_maxchainsent, 0, "use dlil_output_list");
  180 #if DEBUG
  181 static int forge_ce = 0;        /* while nonzero, ip_output_list() rewrites ECT0/ECT1 packets to ECN CE and decrements this once per packet */
  182 SYSCTL_INT(_net_inet_ip, OID_AUTO, forge_ce, CTLFLAG_RW | CTLFLAG_LOCKED,
  183     &forge_ce, 0, "Forge ECN CE");
  184 #endif /* DEBUG */
  185
  186 static int ip_select_srcif_debug = 0;   /* exported read/write by the SYSCTL_INT below; NOTE(review): presumably gates logging in in_selectsrcif() - confirm */
  187 SYSCTL_INT(_net_inet_ip, OID_AUTO, select_srcif_debug, CTLFLAG_RW | CTLFLAG_LOCKED,
  188     &ip_select_srcif_debug, 0, "log source interface selection debug info");
 189
  190 #define IMO_TRACE_HIST_SIZE     32      /* size of trace history: entries in each of imo_refhold[] and imo_refrele[] */
  191
  192 /* For gdb: the same value held in a variable, since a macro leaves no symbol to inspect */
  193 __private_extern__ unsigned int imo_trace_hist_size = IMO_TRACE_HIST_SIZE;
  194
  195 struct ip_moptions_dbg {        /* an ip_moptions followed by reference-tracing bookkeeping */
  196         struct ip_moptions      imo;                    /* ip_moptions; first member, so it sits at offset 0 */
  197         u_int16_t               imo_refhold_cnt;        /* # of IMO_ADDREF (16-bit counter) */
  198         u_int16_t               imo_refrele_cnt;        /* # of IMO_REMREF (16-bit counter) */
  199         /*
  200          * Alloc and free callers (one ctrace_t record each).
  201          */
  202         ctrace_t                imo_alloc;
  203         ctrace_t                imo_free;
  204         /*
  205          * Circular lists of IMO_ADDREF and IMO_REMREF callers, IMO_TRACE_HIST_SIZE entries each.
  206          */
  207         ctrace_t                imo_refhold[IMO_TRACE_HIST_SIZE];
  208         ctrace_t                imo_refrele[IMO_TRACE_HIST_SIZE];
  209 };
 210
  211 #if DEBUG
  212 static unsigned int imo_debug = 1;      /* debugging (enabled by default in DEBUG builds) */
  213 #else
  214 static unsigned int imo_debug;          /* debugging (disabled: zero-initialized static) */
  215 #endif /* !DEBUG */
  216 static unsigned int imo_size;           /* size of zone element; NOTE(review): presumably depends on imo_debug - confirm where the zone is set up */
  217 static struct zone *imo_zone;           /* zone for ip_moptions */
  218
  219 #define IMO_ZONE_MAX            64              /* maximum elements in zone */
  220 #define IMO_ZONE_NAME           "ip_moptions"   /* zone name */
 221
 222 /*
 223  * IP output.  The packet in mbuf chain m contains a skeletal IP
 224  * header (with len, off, ttl, proto, tos, src, dst).
 225  * The mbuf chain containing the packet will be freed.
 226  * The mbuf opt, if present, will not be freed.
 227  */
 228 int
 229 ip_output(
 230         struct mbuf *m0,
 231         struct mbuf *opt,
 232         struct route *ro,
 233         int flags,
 234         struct ip_moptions *imo,
 235         struct ip_out_args *ipoa)
 236 {
 237         int error;
 238         error = ip_output_list(m0, 0, opt, ro, flags, imo, ipoa);
 239         return error;
 240 }
 241
 242 /*
 243  * Returns:     0                       Success
 244  *              ENOMEM
 245  *              EADDRNOTAVAIL
 246  *              ENETUNREACH
 247  *              EHOSTUNREACH
 248  *              EACCES
 249  *              EMSGSIZE
 250  *              ENOBUFS
 251  *      ipsec4_getpolicybyaddr:???      [IPSEC 4th argument, contents modified]
 252  *      ipsec4_getpolicybysock:???      [IPSEC 4th argument, contents modified]
 253  *      key_spdacquire:???              [IPSEC]
 254  *      ipsec4_output:???               [IPSEC]
 255  *      <fr_checkp>:???                 [firewall]
 256  *      ip_dn_io_ptr:???                [dummynet]
 257  *      dlil_output:???                 [DLIL]
 258  *      dlil_output_list:???            [DLIL]
 259  *
 260  * Notes:       The ipsec4_getpolicyby{addr|sock} function error returns are
 261  *              only used as the error return from this function where one of
 262  *              these functions fails to return a policy.
 263  */
 264 int
 265 ip_output_list(
 266         struct mbuf *m0,
 267         int packetchain,
 268         struct mbuf *opt,
 269         struct route *ro,
 270         int flags,
 271         struct ip_moptions *imo,
 272         struct ip_out_args *ipoa
 273         )
 274 {
 275         struct ip *ip;
 276         struct ifnet *ifp = NULL;
 277         struct mbuf *m = m0, **mppn = NULL;
 278         int hlen = sizeof (struct ip);
 279         int len = 0, error = 0;
 280         struct sockaddr_in *dst = NULL;
 281         struct in_ifaddr *ia = NULL, *src_ia = NULL;
 282         int isbroadcast, sw_csum;
 283         struct in_addr pkt_dst;
 284         struct ipf_pktopts *ippo = NULL, ipf_pktopts;
 285 #if IPSEC
 286         struct ipsec_output_state ipsec_state;
 287         struct route *ipsec_saved_route = NULL;
 288         struct socket *so = NULL;
 289         struct secpolicy *sp = NULL;
 290 #endif
 291 #if IPFIREWALL_FORWARD
 292         int fwd_rewrite_src = 0;
 293 #endif
 294 #if IPFIREWALL
 295         int off;
 296         struct ip_fw_args args;
 297         struct m_tag    *tag;
 298         struct sockaddr_in *next_hop_from_ipfwd_tag = NULL;
 299 #endif
 300         int didfilter = 0;
 301         ipfilter_t inject_filter_ref = 0;
 302 #if DUMMYNET
 303         struct route    saved_route;
 304         struct ip_out_args saved_ipoa;
 305         struct sockaddr_in dst_buf;
 306 #endif /* DUMMYNET */
 307         struct mbuf * packetlist;
 308         int pktcnt = 0, tso = 0;
 309         u_int32_t       bytecnt = 0;
 310         unsigned int ifscope;
 311         unsigned int nocell;
 312         boolean_t select_srcif;
 313         KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0);
 314
 315 #if IPSEC
 316         bzero(&ipsec_state, sizeof(ipsec_state));
 317 #endif /* IPSEC */
 318
 319         packetlist = m0;
 320 #if IPFIREWALL
 321         args.next_hop = NULL;
 322         args.eh = NULL;
 323         args.rule = NULL;
 324         args.divert_rule = 0;                   /* divert cookie */
 325         args.ipoa = NULL;
 326
 327         if (SLIST_EMPTY(&m0->m_pkthdr.tags))
 328                 goto ipfw_tags_done;
 329
 330         /* Grab info from mtags prepended to the chain */
 331 #if DUMMYNET
 332         if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID,
 333             KERNEL_TAG_TYPE_DUMMYNET, NULL)) != NULL) {
 334                 struct dn_pkt_tag       *dn_tag;
 335
 336                 dn_tag = (struct dn_pkt_tag *)(tag+1);
 337                 args.rule = dn_tag->rule;
 338                 opt = NULL;
 339                 saved_route = dn_tag->ro;
 340                 ro = &saved_route;
 341
 342                 imo = NULL;
 343                 bcopy(&dn_tag->dn_dst, &dst_buf, sizeof(dst_buf));
 344                 dst = &dst_buf;
 345                 ifp = dn_tag->ifp;
 346                 flags = dn_tag->flags;
 347                 saved_ipoa = dn_tag->ipoa;
 348                 ipoa = &saved_ipoa;
 349
 350                 m_tag_delete(m0, tag);
 351         }
 352 #endif /* DUMMYNET */
 353
 354 #if IPDIVERT
 355         if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID,
 356             KERNEL_TAG_TYPE_DIVERT, NULL)) != NULL) {
 357                 struct divert_tag       *div_tag;
 358
 359                 div_tag = (struct divert_tag *)(tag+1);
 360                 args.divert_rule = div_tag->cookie;
 361
 362                 m_tag_delete(m0, tag);
 363         }
 364 #endif /* IPDIVERT */
 365
 366         if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID,
 367             KERNEL_TAG_TYPE_IPFORWARD, NULL)) != NULL) {
 368                 struct ip_fwd_tag       *ipfwd_tag;
 369
 370                 ipfwd_tag = (struct ip_fwd_tag *)(tag+1);
 371                 next_hop_from_ipfwd_tag = ipfwd_tag->next_hop;
 372
 373                 m_tag_delete(m0, tag);
 374         }
 375 ipfw_tags_done:
 376 #endif /* IPFIREWALL */
 377
 378         m = m0;
 379
 380 #if     DIAGNOSTIC
 381         if ( !m || (m->m_flags & M_PKTHDR) != 0)
 382                 panic("ip_output no HDR");
 383         if (!ro)
 384                 panic("ip_output no route, proto = %d",
 385                       mtod(m, struct ip *)->ip_p);
 386 #endif
 387
 388         bzero(&ipf_pktopts, sizeof(struct ipf_pktopts));
 389         ippo = &ipf_pktopts;
 390
 391         /*
 392          * At present the IP_OUTARGS flag implies a request for IP to
 393          * perform source interface selection.  In the forwarding case,
 394          * only the ifscope value is used, as source interface selection
 395          * doesn't take place.
 396          */
 397         if (ip_doscopedroute && (flags & IP_OUTARGS)) {
 398                 select_srcif = !(flags & IP_FORWARDING);
 399                 ifscope = ipoa->ipoa_boundif;
 400                 ipf_pktopts.ippo_flags = IPPOF_BOUND_IF;
 401                 ipf_pktopts.ippo_flags |= (ifscope << IPPOF_SHIFT_IFSCOPE);
 402         } else {
 403                 select_srcif = FALSE;
 404                 ifscope = IFSCOPE_NONE;
 405         }
 406
 407         if (flags & IP_OUTARGS) {
 408                 nocell = ipoa->ipoa_nocell;
 409                 if (nocell)
 410                         ipf_pktopts.ippo_flags |= IPPOF_NO_IFT_CELLULAR;
 411         } else {
 412                 nocell = 0;
 413         }
 414
 415 #if IPFIREWALL
 416         if (args.rule != NULL) {        /* dummynet already saw us */
 417                 ip = mtod(m, struct ip *);
 418                 hlen = IP_VHL_HL(ip->ip_vhl) << 2 ;
 419                 if (ro->ro_rt != NULL) {
 420                         RT_LOCK_SPIN(ro->ro_rt);
 421                         ia = (struct in_ifaddr *)ro->ro_rt->rt_ifa;
 422                         if (ia) {
 423                                 /* Become a regular mutex */
 424                                 RT_CONVERT_LOCK(ro->ro_rt);
 425                                 IFA_ADDREF(&ia->ia_ifa);
 426                         }
 427                         RT_UNLOCK(ro->ro_rt);
 428                 }
 429 #if IPSEC
 430                 if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) {
 431                         so = ipsec_getsocket(m);
 432                         (void)ipsec_setsocket(m, NULL);
 433                 }
 434 #endif
 435                 goto sendit;
 436         }
 437 #endif /* IPFIREWALL */
 438
 439 #if IPSEC
 440         if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) {
 441                 so = ipsec_getsocket(m);
 442                 (void)ipsec_setsocket(m, NULL);
 443         }
 444 #endif
 445 loopit:
 446         /*
  447          * No need to process packet twice if we've
 448          * already seen it
 449          */
 450         if (!SLIST_EMPTY(&m->m_pkthdr.tags))
 451                 inject_filter_ref = ipf_get_inject_filter(m);
 452         else
 453                 inject_filter_ref = 0;
 454
 455         if (opt) {
 456                 m = ip_insertoptions(m, opt, &len);
 457                 hlen = len;
 458         }
 459         ip = mtod(m, struct ip *);
 460 #if IPFIREWALL
 461         /*
 462          * rdar://8542331
 463          *
 464          * When dealing with a packet chain, we need to reset "next_hop" because
 465          * "dst" may have been changed to the gateway address below for the previous
  466          * packet of the chain. This could cause the route to be inadvertently changed
 467          * to the route to the gateway address (instead of the route to the destination).
 468          */
 469         args.next_hop = next_hop_from_ipfwd_tag;
 470         pkt_dst = args.next_hop ? args.next_hop->sin_addr : ip->ip_dst;
 471 #else
 472         pkt_dst = ip->ip_dst;
 473 #endif
 474
 475         /*
 476          * We must not send if the packet is destined to network zero.
 477          * RFC1122 3.2.1.3 (a) and (b).
 478          */
 479         if (IN_ZERONET(ntohl(pkt_dst.s_addr))) {
 480                 error = EHOSTUNREACH;
 481                 goto bad;
 482         }
 483
 484         /*
 485          * Fill in IP header.
 486          */
 487         if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
 488                 ip->ip_vhl = IP_MAKE_VHL(IPVERSION, hlen >> 2);
 489                 ip->ip_off &= IP_DF;
 490 #if RANDOM_IP_ID
 491                 ip->ip_id = ip_randomid();
 492 #else
 493                 ip->ip_id = htons(ip_id++);
 494 #endif
 495                 OSAddAtomic(1, &ipstat.ips_localout);
 496         } else {
 497                 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
 498         }
 499
 500 #if DEBUG
 501         /* For debugging, we let the stack forge congestion */
 502         if (forge_ce != 0 &&
 503                 ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT1 ||
 504                  (ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT0)) {
 505                 ip->ip_tos = (ip->ip_tos & ~IPTOS_ECN_MASK) | IPTOS_ECN_CE;
 506                 forge_ce--;
 507         }
 508 #endif /* DEBUG */
 509
 510         KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr,
 511                      ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len);
 512
 513         dst = (struct sockaddr_in *)&ro->ro_dst;
 514
 515         /*
 516          * If there is a cached route,
 517          * check that it is to the same destination
 518          * and is still up.  If not, free it and try again.
 519          * The address family should also be checked in case of sharing the
 520          * cache with IPv6.
 521          */
 522
 523         if (ro->ro_rt != NULL) {
 524                 if (ro->ro_rt->generation_id != route_generation &&
 525                     ((flags & (IP_ROUTETOIF | IP_FORWARDING)) == 0) &&
 526                     (ip->ip_src.s_addr != INADDR_ANY)) {
 527                         src_ia = ifa_foraddr(ip->ip_src.s_addr);
 528                         if (src_ia == NULL) {
 529                                 error = EADDRNOTAVAIL;
 530                                 goto bad;
 531                         }
 532                         IFA_REMREF(&src_ia->ia_ifa);
 533                 }
 534                 /*
 535                  * Test rt_flags without holding rt_lock for performance
 536                  * reasons; if the route is down it will hopefully be
 537                  * caught by the layer below (since it uses this route
 538                  * as a hint) or during the next transmit.
 539                  */
 540                 if ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
 541                     dst->sin_family != AF_INET ||
 542                     dst->sin_addr.s_addr != pkt_dst.s_addr) {
 543                         rtfree(ro->ro_rt);
 544                         ro->ro_rt = NULL;
 545                 }
 546                 /*
 547                  * If we're doing source interface selection, we may not
 548                  * want to use this route; only synch up the generation
 549                  * count otherwise.
 550                  */
 551                 if (!select_srcif && ro->ro_rt != NULL &&
 552                     ro->ro_rt->generation_id != route_generation)
 553                         ro->ro_rt->generation_id = route_generation;
 554         }
 555         if (ro->ro_rt == NULL) {
 556                 bzero(dst, sizeof(*dst));
 557                 dst->sin_family = AF_INET;
 558                 dst->sin_len = sizeof(*dst);
 559                 dst->sin_addr = pkt_dst;
 560         }
 561         /*
 562          * If routing to interface only,
 563          * short circuit routing lookup.
 564          */
 565 #define ifatoia(ifa)    ((struct in_ifaddr *)(ifa))
 566 #define sintosa(sin)    ((struct sockaddr *)(sin))
 567         if (flags & IP_ROUTETOIF) {
 568                 if (ia)
 569                         IFA_REMREF(&ia->ia_ifa);
 570                 if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == 0) {
 571                         if ((ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == 0) {
 572                                 OSAddAtomic(1, &ipstat.ips_noroute);
 573                                 error = ENETUNREACH;
 574                                 goto bad;
 575                         }
 576                 }
 577                 ifp = ia->ia_ifp;
 578                 ip->ip_ttl = 1;
 579                 isbroadcast = in_broadcast(dst->sin_addr, ifp);
 580         } else if (IN_MULTICAST(ntohl(pkt_dst.s_addr)) &&
 581             imo != NULL && (ifp = imo->imo_multicast_ifp) != NULL) {
 582                 /*
 583                  * Bypass the normal routing lookup for multicast
 584                  * packets if the interface is specified.
 585                  */
 586                 isbroadcast = 0;
 587                 if (ia != NULL)
 588                         IFA_REMREF(&ia->ia_ifa);
 589
 590                 /* Macro takes reference on ia */
 591                 IFP_TO_IA(ifp, ia);
 592         } else {
 593                 boolean_t cloneok = FALSE;
 594                 /*
 595                  * Perform source interface selection; the source IP address
 596                  * must belong to one of the addresses of the interface used
 597                  * by the route.  For performance reasons, do this only if
 598                  * there is no route, or if the routing table has changed,
 599                  * or if we haven't done source interface selection on this
 600                  * route (for this PCB instance) before.
 601                  */
 602                 if (select_srcif && ip->ip_src.s_addr != INADDR_ANY &&
 603                     (ro->ro_rt == NULL || !(ro->ro_rt->rt_flags & RTF_UP) ||
 604                     ro->ro_rt->generation_id != route_generation ||
 605                     !(ro->ro_flags & ROF_SRCIF_SELECTED))) {
 606                         struct ifaddr *ifa;
 607
 608                         /* Find the source interface */
 609                         ifa = in_selectsrcif(ip, ro, ifscope);
 610
 611                         /*
 612                          * If the source address belongs to a cellular interface
 613                          * and the caller forbids our using interfaces of such
 614                          * type, pretend that there is no source address.
 615                          */
 616                         if (nocell && ifa != NULL &&
 617                             ifa->ifa_ifp->if_type == IFT_CELLULAR) {
 618                                 IFA_REMREF(ifa);
 619                                 error = EADDRNOTAVAIL;
 620                                 goto bad;
 621                         }
 622
 623                         /*
 624                          * If the source address is spoofed (in the case
 625                          * of IP_RAWOUTPUT), or if this is destined for
 626                          * local/loopback, just let it go out using the
 627                          * interface of the route.  Otherwise, there's no
 628                          * interface having such an address, so bail out.
 629                          */
 630                         if (ifa == NULL && !(flags & IP_RAWOUTPUT) &&
 631                             ifscope != lo_ifp->if_index) {
 632                                 error = EADDRNOTAVAIL;
 633                                 goto bad;
 634                         }
 635
 636                         /*
 637                          * If the caller didn't explicitly specify the scope,
 638                          * pick it up from the source interface.  If the cached
 639                          * route was wrong and was blown away as part of source
 640                          * interface selection, don't mask out RTF_PRCLONING
 641                          * since that route may have been allocated by the ULP,
 642                          * unless the IP header was created by the caller or
 643                          * the destination is IPv4 LLA.  The check for the
 644                          * latter is needed because IPv4 LLAs are never scoped
 645                          * in the current implementation, and we don't want to
 646                          * replace the resolved IPv4 LLA route with one whose
 647                          * gateway points to that of the default gateway on
 648                          * the primary interface of the system.
 649                          */
 650                         if (ifa != NULL) {
 651                                 if (ifscope == IFSCOPE_NONE)
 652                                         ifscope = ifa->ifa_ifp->if_index;
 653                                 IFA_REMREF(ifa);
 654                                 cloneok = (!(flags & IP_RAWOUTPUT) &&
 655                                     !(IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))));
 656                         }
 657                 }
 658
 659                 /*
 660                  * If this is the case, we probably don't want to allocate
 661                  * a protocol-cloned route since we didn't get one from the
 662                  * ULP.  This lets TCP do its thing, while not burdening
 663                  * forwarding or ICMP with the overhead of cloning a route.
 664                  * Of course, we still want to do any cloning requested by
 665                  * the link layer, as this is probably required in all cases
 666                  * for correct operation (as it is for ARP).
 667                  */
 668                 if (ro->ro_rt == NULL) {
 669                         unsigned long ign = RTF_PRCLONING;
 670                         /*
 671                          * We make an exception here: if the destination
 672                          * address is INADDR_BROADCAST, allocate a protocol-
 673                          * cloned host route so that we end up with a route
 674                          * marked with the RTF_BROADCAST flag.  Otherwise,
 675                          * we would end up referring to the default route,
 676                          * instead of creating a cloned host route entry.
 677                          * That would introduce inconsistencies between ULPs
 678                          * that allocate a route and those that don't.  The
 679                          * RTF_BROADCAST route is important since we'd want
 680                          * to send out undirected IP broadcast packets using
 681                          * link-level broadcast address. Another exception
 682                          * is for ULP-created routes that got blown away by
 683                          * source interface selection (see above).
 684                          *
 685                          * These exceptions will no longer be necessary when
 686                          * the RTF_PRCLONING scheme is no longer present.
 687                          */
 688                         if (cloneok || dst->sin_addr.s_addr == INADDR_BROADCAST)
 689                                 ign &= ~RTF_PRCLONING;
 690
 691                         /*
 692                          * Loosen the route lookup criteria if the ifscope
 693                          * corresponds to the loopback interface; this is
 694                          * needed to support Application Layer Gateways
 695                          * listening on loopback, in conjunction with packet
 696                          * filter redirection rules.  The final source IP
 697                          * address will be rewritten by the packet filter
 698                          * prior to the RFC1122 loopback check below.
 699                          */
 700                         if (ifscope == lo_ifp->if_index)
 701                                 rtalloc_ign(ro, ign);
 702                         else
 703                                 rtalloc_scoped_ign(ro, ign, ifscope);
 704
 705                         /*
 706                          * If the route points to a cellular interface and the
 707                          * caller forbids our using interfaces of such type,
 708                          * pretend that there is no route.
 709                          */
 710                         if (nocell && ro->ro_rt != NULL) {
 711                                 RT_LOCK_SPIN(ro->ro_rt);
 712                                 if (ro->ro_rt->rt_ifp->if_type ==
 713                                     IFT_CELLULAR) {
 714                                         RT_UNLOCK(ro->ro_rt);
 715                                         rtfree(ro->ro_rt);
 716                                         ro->ro_rt = NULL;
 717                                 } else {
 718                                         RT_UNLOCK(ro->ro_rt);
 719                                 }
 720                         }
 721                 }
 722
 723                 if (ro->ro_rt == NULL) {
 724                         OSAddAtomic(1, &ipstat.ips_noroute);
 725                         error = EHOSTUNREACH;
 726                         goto bad;
 727                 }
 728
 729                 if (ia)
 730                         IFA_REMREF(&ia->ia_ifa);
 731                 RT_LOCK_SPIN(ro->ro_rt);
 732                 ia = ifatoia(ro->ro_rt->rt_ifa);
 733                 if (ia) {
 734                         /* Become a regular mutex */
 735                         RT_CONVERT_LOCK(ro->ro_rt);
 736                         IFA_ADDREF(&ia->ia_ifa);
 737                 }
 738                 ifp = ro->ro_rt->rt_ifp;
 739                 ro->ro_rt->rt_use++;
 740                 if (ro->ro_rt->rt_flags & RTF_GATEWAY)
 741                         dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway;
 742                 if (ro->ro_rt->rt_flags & RTF_HOST) {
 743                         isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST);
 744                 } else {
 745                         /* Become a regular mutex */
 746                         RT_CONVERT_LOCK(ro->ro_rt);
 747                         isbroadcast = in_broadcast(dst->sin_addr, ifp);
 748                 }
 749                 RT_UNLOCK(ro->ro_rt);
 750         }
 751
 752         if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
 753                 struct in_multi *inm;
 754                 u_int32_t vif;
 755                 u_int8_t ttl = IP_DEFAULT_MULTICAST_TTL;
 756                 u_int8_t loop = IP_DEFAULT_MULTICAST_LOOP;
 757
 758                 m->m_flags |= M_MCAST;
 759                 /*
 760                  * IP destination address is multicast.  Make sure "dst"
 761                  * still points to the address in "ro".  (It may have been
 762                  * changed to point to a gateway address, above.)
 763                  */
 764                 dst = (struct sockaddr_in *)&ro->ro_dst;
 765                 /*
 766                  * See if the caller provided any multicast options
 767                  */
 768                 if (imo != NULL) {
 769                         IMO_LOCK(imo);
 770                         vif = imo->imo_multicast_vif;
 771                         ttl = imo->imo_multicast_ttl;
 772                         loop = imo->imo_multicast_loop;
 773                         if ((flags & IP_RAWOUTPUT) == 0)
 774                                 ip->ip_ttl = ttl;
 775                         if (imo->imo_multicast_ifp != NULL)
 776                                 ifp = imo->imo_multicast_ifp;
 777                         IMO_UNLOCK(imo);
 778 #if MROUTING
 779                         if (vif != -1 && ((flags & IP_RAWOUTPUT) == 0 ||
 780                             ip->ip_src.s_addr == INADDR_ANY))
 781                                 ip->ip_src.s_addr = ip_mcast_src(vif);
 782 #endif /* MROUTING */
 783                 } else if ((flags & IP_RAWOUTPUT) == 0) {
 784                         vif = -1;
 785                         ip->ip_ttl = ttl;
 786                 }
 787                 /*
 788                  * Confirm that the outgoing interface supports multicast.
 789                  */
 790                 if (imo == NULL || vif == -1) {
 791                         if ((ifp->if_flags & IFF_MULTICAST) == 0) {
 792                                 OSAddAtomic(1, &ipstat.ips_noroute);
 793                                 error = ENETUNREACH;
 794                                 goto bad;
 795                         }
 796                 }
 797                 /*
 798                  * If source address not specified yet, use address
 799                  * of outgoing interface.
 800                  */
 801                 if (ip->ip_src.s_addr == INADDR_ANY) {
 802                         struct in_ifaddr *ia1;
 803                         lck_rw_lock_shared(in_ifaddr_rwlock);
 804                         TAILQ_FOREACH(ia1, &in_ifaddrhead, ia_link) {
 805                                 IFA_LOCK_SPIN(&ia1->ia_ifa);
 806                                 if (ia1->ia_ifp == ifp) {
 807                                         ip->ip_src = IA_SIN(ia1)->sin_addr;
 808                                         IFA_UNLOCK(&ia1->ia_ifa);
 809                                         break;
 810                                 }
 811                                 IFA_UNLOCK(&ia1->ia_ifa);
 812                         }
 813                         lck_rw_done(in_ifaddr_rwlock);
 814                         if (ip->ip_src.s_addr == INADDR_ANY) {
 815                                 error = ENETUNREACH;
 816                                 goto bad;
 817                         }
 818                 }
 819
 820                 in_multihead_lock_shared();
 821                 IN_LOOKUP_MULTI(&pkt_dst, ifp, inm);
 822                 in_multihead_lock_done();
 823                 if (inm != NULL && (imo == NULL || loop)) {
 824                         /*
 825                          * If we belong to the destination multicast group
 826                          * on the outgoing interface, and the caller did not
 827                          * forbid loopback, loop back a copy.
 828                          */
 829                         if (!TAILQ_EMPTY(&ipv4_filters)) {
 830                                 struct ipfilter *filter;
 831                                 int seen = (inject_filter_ref == 0);
 832
 833                                 if (imo != NULL) {
 834                                         ipf_pktopts.ippo_flags |= IPPOF_MCAST_OPTS;
 835                                         ipf_pktopts.ippo_mcast_ifnet = ifp;
 836                                         ipf_pktopts.ippo_mcast_ttl = ttl;
 837                                         ipf_pktopts.ippo_mcast_loop = loop;
 838                                 }
 839
 840                                 ipf_ref();
 841
 842                                 /* 4135317 - always pass network byte order to filter */
 843
 844 #if BYTE_ORDER != BIG_ENDIAN
 845                                 HTONS(ip->ip_len);
 846                                 HTONS(ip->ip_off);
 847 #endif
 848
 849                                 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
 850                                         if (seen == 0) {
 851                                                 if ((struct ipfilter *)inject_filter_ref == filter)
 852                                                         seen = 1;
 853                                         } else if (filter->ipf_filter.ipf_output) {
 854                                                 errno_t result;
 855                                                 result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo);
 856                                                 if (result == EJUSTRETURN) {
 857                                                         ipf_unref();
 858                                                         INM_REMREF(inm);
 859                                                         goto done;
 860                                                 }
 861                                                 if (result != 0) {
 862                                                         ipf_unref();
 863                                                         INM_REMREF(inm);
 864                                                         goto bad;
 865                                                 }
 866                                         }
 867                                 }
 868
 869                                 /* set back to host byte order */
 870                                 ip = mtod(m, struct ip *);
 871
 872 #if BYTE_ORDER != BIG_ENDIAN
 873                                 NTOHS(ip->ip_len);
 874                                 NTOHS(ip->ip_off);
 875 #endif
 876
 877                                 ipf_unref();
 878                                 didfilter = 1;
 879                         }
 880                         ip_mloopback(ifp, m, dst, hlen);
 881                 }
 882 #if MROUTING
 883                 else {
 884                         /*
 885                          * If we are acting as a multicast router, perform
 886                          * multicast forwarding as if the packet had just
 887                          * arrived on the interface to which we are about
 888                          * to send.  The multicast forwarding function
 889                          * recursively calls this function, using the
 890                          * IP_FORWARDING flag to prevent infinite recursion.
 891                          *
 892                          * Multicasts that are looped back by ip_mloopback(),
 893                          * above, will be forwarded by the ip_input() routine,
 894                          * if necessary.
 895                          */
 896                         if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
 897                                 /*
 898                                  * Check if rsvp daemon is running. If not, don't
 899                                  * set ip_moptions. This ensures that the packet
 900                                  * is multicast and not just sent down one link
 901                                  * as prescribed by rsvpd.
 902                                  */
 903                                 if (!rsvp_on)
 904                                         imo = NULL;
 905                                 if (ip_mforward(ip, ifp, m, imo) != 0) {
 906                                         m_freem(m);
 907                                         if (inm != NULL)
 908                                                 INM_REMREF(inm);
 909                                         goto done;
 910                                 }
 911                         }
 912                 }
 913 #endif /* MROUTING */
 914                 if (inm != NULL)
 915                         INM_REMREF(inm);
 916                 /*
 917                  * Multicasts with a time-to-live of zero may be looped-
 918                  * back, above, but must not be transmitted on a network.
 919                  * Also, multicasts addressed to the loopback interface
 920                  * are not sent -- the above call to ip_mloopback() will
 921                  * loop back a copy if this host actually belongs to the
 922                  * destination group on the loopback interface.
 923                  */
 924                 if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
 925                         m_freem(m);
 926                         goto done;
 927                 }
 928
 929                 goto sendit;
 930         }
 931 #ifndef notdef
 932         /*
 933          * If source address not specified yet, use address
 934          * of outgoing interface.
 935          */
 936         if (ip->ip_src.s_addr == INADDR_ANY) {
 937                 IFA_LOCK_SPIN(&ia->ia_ifa);
 938                 ip->ip_src = IA_SIN(ia)->sin_addr;
 939                 IFA_UNLOCK(&ia->ia_ifa);
 940 #if IPFIREWALL_FORWARD
 941                 /* Keep note that we did this - if the firewall changes
 942                  * the next-hop, our interface may change, changing the
 943                  * default source IP. It's a shame so much effort happens
 944                  * twice. Oh well.
 945                  */
 946                 fwd_rewrite_src++;
 947 #endif /* IPFIREWALL_FORWARD */
 948         }
 949 #endif /* notdef */
 950
 951         /*
 952          * Look for broadcast address and
 953          * verify user is allowed to send
 954          * such a packet.
 955          */
 956         if (isbroadcast) {
 957                 if ((ifp->if_flags & IFF_BROADCAST) == 0) {
 958                         error = EADDRNOTAVAIL;
 959                         goto bad;
 960                 }
 961                 if ((flags & IP_ALLOWBROADCAST) == 0) {
 962                         error = EACCES;
 963                         goto bad;
 964                 }
 965                 /* don't allow broadcast messages to be fragmented */
 966                 if ((u_short)ip->ip_len > ifp->if_mtu) {
 967                         error = EMSGSIZE;
 968                         goto bad;
 969                 }
 970                 m->m_flags |= M_BCAST;
 971         } else {
 972                 m->m_flags &= ~M_BCAST;
 973         }
 974
 975 sendit:
 976 #if PF
 977         /* Invoke outbound packet filter */
 978         if ( PF_IS_ENABLED) {
 979                 int rc;
 980                 rc = pf_af_hook(ifp, mppn, &m, AF_INET, FALSE);
 981                 if (rc != 0) {
 982                         if (packetlist == m0) {
 983                                 packetlist = m;
 984                                 mppn = NULL;
 985                         }
 986                         if (m != NULL) {
 987                                 m0 = m;
 988                                 /* Next packet in the chain */
 989                                 goto loopit;
 990                         } else if (packetlist != NULL) {
 991                                 /* No more packet; send down the chain */
 992                                 goto sendchain;
 993                         }
 994                         /* Nothing left; we're done */
 995                         goto done;
 996                 }
 997                 m0 = m;
 998                 ip = mtod(m, struct ip *);
 999                 pkt_dst = ip->ip_dst;
1000                 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1001         }
1002 #endif /* PF */
1003         /*
1004          * Force IP TTL to 255 following draft-ietf-zeroconf-ipv4-linklocal.txt
1005          */
1006         if (IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)) || IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) {
1007                 ip_linklocal_stat.iplls_out_total++;
1008                 if (ip->ip_ttl != MAXTTL) {
1009                         ip_linklocal_stat.iplls_out_badttl++;
1010                         ip->ip_ttl = MAXTTL;
1011                 }
1012         }
1013
1014         if (!didfilter && !TAILQ_EMPTY(&ipv4_filters)) {
1015                 struct ipfilter *filter;
1016                 int seen = (inject_filter_ref == 0);
1017                 ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS;
1018
1019                 /* Check that a TSO frame isn't passed to a filter.
1020                  * This could happen if a filter is inserted while
1021                  * TCP is sending the TSO packet.
1022                  */
1023                 if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) {
1024                         error = EMSGSIZE;
1025                         goto bad;
1026                 }
1027
1028                 ipf_ref();
1029
1030                 /* 4135317 - always pass network byte order to filter */
1031
1032 #if BYTE_ORDER != BIG_ENDIAN
1033                 HTONS(ip->ip_len);
1034                 HTONS(ip->ip_off);
1035 #endif
1036
1037                 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
1038                         if (seen == 0) {
1039                                 if ((struct ipfilter *)inject_filter_ref == filter)
1040                                         seen = 1;
1041                         } else if (filter->ipf_filter.ipf_output) {
1042                                 errno_t result;
1043                                 result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo);
1044                                 if (result == EJUSTRETURN) {
1045                                         ipf_unref();
1046                                         goto done;
1047                                 }
1048                                 if (result != 0) {
1049                                         ipf_unref();
1050                                         goto bad;
1051                                 }
1052                         }
1053                 }
1054
1055                 /* set back to host byte order */
1056                 ip = mtod(m, struct ip *);
1057
1058 #if BYTE_ORDER != BIG_ENDIAN
1059                 NTOHS(ip->ip_len);
1060                 NTOHS(ip->ip_off);
1061 #endif
1062
1063                 ipf_unref();
1064         }
1065
1066 #if IPSEC
1067         /* temporary for testing only: bypass ipsec altogether */
1068
1069         if (ipsec_bypass != 0 || (flags & IP_NOIPSEC) != 0)
1070                 goto skip_ipsec;
1071
1072         KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_START, 0,0,0,0,0);
1073
1074
1075         /* get SP for this packet */
1076         if (so == NULL)
1077                 sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, flags, &error);
1078         else
1079                 sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND, so, &error);
1080
1081         if (sp == NULL) {
1082                 IPSEC_STAT_INCREMENT(ipsecstat.out_inval);
1083                 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
1084                 goto bad;
1085         }
1086
1087         error = 0;
1088
1089         /* check policy */
1090         switch (sp->policy) {
1091         case IPSEC_POLICY_DISCARD:
1092         case IPSEC_POLICY_GENERATE:
1093                 /*
1094                  * This packet is just discarded.
1095                  */
1096                 IPSEC_STAT_INCREMENT(ipsecstat.out_polvio);
1097                 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 1,0,0,0,0);
1098                 goto bad;
1099
1100         case IPSEC_POLICY_BYPASS:
1101         case IPSEC_POLICY_NONE:
1102                 /* no need to do IPsec. */
1103                 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 2,0,0,0,0);
1104                 goto skip_ipsec;
1105
1106         case IPSEC_POLICY_IPSEC:
1107                 if (sp->req == NULL) {
1108                         /* acquire a policy */
1109                         error = key_spdacquire(sp);
1110                         KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 3,0,0,0,0);
1111                         goto bad;
1112                 }
1113                 break;
1114
1115         case IPSEC_POLICY_ENTRUST:
1116         default:
1117                 printf("ip_output: Invalid policy found. %d\n", sp->policy);
1118         }
1119     {
1120         ipsec_state.m = m;
1121         if (flags & IP_ROUTETOIF) {
1122                 bzero(&ipsec_state.ro, sizeof(ipsec_state.ro));
1123         } else
1124                 route_copyout(&ipsec_state.ro, ro, sizeof(ipsec_state.ro));
1125         ipsec_state.dst = (struct sockaddr *)dst;
1126
1127         ip->ip_sum = 0;
1128
1129         /*
1130          * XXX
1131          * delayed checksums are not currently compatible with IPsec
1132          */
1133         if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1134                 in_delayed_cksum(m);
1135                 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1136         }
1137
1138
1139 #if BYTE_ORDER != BIG_ENDIAN
1140         HTONS(ip->ip_len);
1141         HTONS(ip->ip_off);
1142 #endif
1143
1144         DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
1145                 struct ip *, ip, struct ifnet *, ifp,
1146                 struct ip *, ip, struct ip6_hdr *, NULL);
1147
1148         error = ipsec4_output(&ipsec_state, sp, flags);
1149
1150         m0 = m = ipsec_state.m;
1151
1152         if (flags & IP_ROUTETOIF) {
1153                 /*
1154                  * if we have tunnel mode SA, we may need to ignore
1155                  * IP_ROUTETOIF.
1156                  */
1157                 if (ipsec_state.tunneled) {
1158                         flags &= ~IP_ROUTETOIF;
1159                         ipsec_saved_route = ro;
1160                         ro = &ipsec_state.ro;
1161                 }
1162         } else {
1163                 ipsec_saved_route = ro;
1164                 ro = &ipsec_state.ro;
1165         }
1166         dst = (struct sockaddr_in *)ipsec_state.dst;
1167         if (error) {
1168                 /* mbuf is already reclaimed in ipsec4_output. */
1169                 m0 = NULL;
1170                 switch (error) {
1171                 case EHOSTUNREACH:
1172                 case ENETUNREACH:
1173                 case EMSGSIZE:
1174                 case ENOBUFS:
1175                 case ENOMEM:
1176                         break;
1177                 default:
1178                         printf("ip4_output (ipsec): error code %d\n", error);
1179                         /*fall through*/
1180                 case ENOENT:
1181                         /* don't show these error codes to the user */
1182                         error = 0;
1183                         break;
1184                 }
1185                 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 4,0,0,0,0);
1186                 goto bad;
1187         }
1188     }
1189
1190         /* be sure to update variables that are affected by ipsec4_output() */
1191         ip = mtod(m, struct ip *);
1192
1193 #ifdef _IP_VHL
1194         hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1195 #else
1196         hlen = ip->ip_hl << 2;
1197 #endif
1198         /* Check that there wasn't a route change and src is still valid */
1199         if (ro->ro_rt != NULL && ro->ro_rt->generation_id != route_generation) {
1200                 if ((src_ia = ifa_foraddr(ip->ip_src.s_addr)) == NULL &&
1201                     ((flags & (IP_ROUTETOIF | IP_FORWARDING)) == 0)) {
1202                         error = EADDRNOTAVAIL;
1203                         KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1204                             5,0,0,0,0);
1205                         goto bad;
1206                 }
1207                 rtfree(ro->ro_rt);
1208                 ro->ro_rt = NULL;
1209                 if (src_ia != NULL)
1210                         IFA_REMREF(&src_ia->ia_ifa);
1211         }
1212
1213         if (ro->ro_rt == NULL) {
1214                 if ((flags & IP_ROUTETOIF) == 0) {
1215                         printf("ip_output: can't update route after "
1216                             "IPsec processing\n");
1217                         error = EHOSTUNREACH;   /*XXX*/
1218                         KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1219                             6,0,0,0,0);
1220                         goto bad;
1221                 }
1222         } else {
1223                 if (ia)
1224                         IFA_REMREF(&ia->ia_ifa);
1225                 RT_LOCK_SPIN(ro->ro_rt);
1226                 ia = ifatoia(ro->ro_rt->rt_ifa);
1227                 if (ia) {
1228                         /* Become a regular mutex */
1229                         RT_CONVERT_LOCK(ro->ro_rt);
1230                         IFA_ADDREF(&ia->ia_ifa);
1231                 }
1232                 ifp = ro->ro_rt->rt_ifp;
1233                 RT_UNLOCK(ro->ro_rt);
1234         }
1235
1236         /* make it flipped, again. */
1237
1238 #if BYTE_ORDER != BIG_ENDIAN
1239         NTOHS(ip->ip_len);
1240         NTOHS(ip->ip_off);
1241 #endif
1242
1243         KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 7,0xff,0xff,0xff,0xff);
1244
1245         /* Pass to filters again */
1246         if (!TAILQ_EMPTY(&ipv4_filters)) {
1247                 struct ipfilter *filter;
1248
1249                 ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS;
1250
1251                 /* Check that a TSO frame isn't passed to a filter.
1252                  * This could happen if a filter is inserted while
1253                  * TCP is sending the TSO packet.
1254                  */
1255                 if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) {
1256                         error = EMSGSIZE;
1257                         goto bad;
1258                 }
1259
1260                 ipf_ref();
1261
1262                 /* 4135317 - always pass network byte order to filter */
1263
1264 #if BYTE_ORDER != BIG_ENDIAN
1265                 HTONS(ip->ip_len);
1266                 HTONS(ip->ip_off);
1267 #endif
1268
1269                 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
1270                         if (filter->ipf_filter.ipf_output) {
1271                                 errno_t result;
1272                                 result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo);
1273                                 if (result == EJUSTRETURN) {
1274                                         ipf_unref();
1275                                         goto done;
1276                                 }
1277                                 if (result != 0) {
1278                                         ipf_unref();
1279                                         goto bad;
1280                                 }
1281                         }
1282                 }
1283
1284                 /* set back to host byte order */
1285                 ip = mtod(m, struct ip *);
1286
1287 #if BYTE_ORDER != BIG_ENDIAN
1288                 NTOHS(ip->ip_len);
1289                 NTOHS(ip->ip_off);
1290 #endif
1291
1292                 ipf_unref();
1293         }
1294 skip_ipsec:
1295 #endif /*IPSEC*/
1296
1297 #if IPFIREWALL
1298         /*
1299          * IpHack's section.
1300          * - Xlate: translate packet's addr/port (NAT).
1301          * - Firewall: deny/allow/etc.
1302          * - Wrap: fake packet's addr/port <unimpl.>
1303          * - Encapsulate: put it in another IP and send out. <unimp.>
1304          */
1305         if (fr_checkp) {
1306                 struct  mbuf    *m1 = m;
1307
1308                 if ((error = (*fr_checkp)(ip, hlen, ifp, 1, &m1)) || !m1) {
1309                         goto done;
1310                 }
1311                 ip = mtod(m0 = m = m1, struct ip *);
1312         }
1313
1314         /*
1315          * Check with the firewall...
1316          * but not if we are already being fwd'd from a firewall.
1317          */
1318         if (fw_enable && IPFW_LOADED && !args.next_hop) {
1319                 struct sockaddr_in *old = dst;
1320
1321                 args.m = m;
1322                 args.next_hop = dst;
1323                 args.oif = ifp;
1324                 off = ip_fw_chk_ptr(&args);
1325                 m = args.m;
1326                 dst = args.next_hop;
1327
1328                 /*
1329                  * On return we must do the following:
1330                  * IP_FW_PORT_DENY_FLAG         -> drop the pkt (XXX new)
1331                  * 1<=off<= 0xffff   -> DIVERT
1332                  * (off & IP_FW_PORT_DYNT_FLAG) -> send to a DUMMYNET pipe
1333                  * (off & IP_FW_PORT_TEE_FLAG)  -> TEE the packet
1334                  * dst != old        -> IPFIREWALL_FORWARD
1335                  * off==0, dst==old  -> accept
1336                  * If some of the above modules are not compiled in, then
1337                  * we shouldn't have to check the corresponding condition
1338                  * (because the ipfw control socket should not accept
1339                  * unsupported rules), but better play safe and drop
1340                  * packets in case of doubt.
1341                  */
1342                 m0 = m;
1343                 if ( (off & IP_FW_PORT_DENY_FLAG) || m == NULL) {
1344                         if (m)
1345                                 m_freem(m);
1346                         error = EACCES ;
1347                         goto done ;
1348                 }
1349                 ip = mtod(m, struct ip *);
1350
1351                 if (off == 0 && dst == old) {/* common case */
1352                         goto pass ;
1353                 }
1354 #if DUMMYNET
1355                 if (DUMMYNET_LOADED && (off & IP_FW_PORT_DYNT_FLAG) != 0) {
1356                         /*
1357                          * pass the pkt to dummynet. Need to include
1358                          * pipe number, m, ifp, ro, dst because these are
1359                          * not recomputed in the next pass.
1360                          * All other parameters have been already used and
1361                          * so they are not needed anymore.
1362                          * XXX note: if the ifp or ro entry are deleted
1363                          * while a pkt is in dummynet, we are in trouble!
1364                          */
1365                         args.ro = ro;
1366                         args.dst = dst;
1367                         args.flags = flags;
1368                         if (flags & IP_OUTARGS)
1369                                 args.ipoa = ipoa;
1370
1371                         error = ip_dn_io_ptr(m, off & 0xffff, DN_TO_IP_OUT,
1372                             &args);
1373                         goto done;
1374                 }
1375 #endif /* DUMMYNET */
1376 #if IPDIVERT
1377                 if (off != 0 && (off & IP_FW_PORT_DYNT_FLAG) == 0) {
1378                         struct mbuf *clone = NULL;
1379
1380                         /* Clone packet if we're doing a 'tee' */
1381                         if ((off & IP_FW_PORT_TEE_FLAG) != 0)
1382                                 clone = m_dup(m, M_DONTWAIT);
1383                         /*
1384                          * XXX
1385                          * delayed checksums are not currently compatible
1386                          * with divert sockets.
1387                          */
1388                         if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1389                                 in_delayed_cksum(m);
1390                                 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1391                         }
1392
1393                         /* Restore packet header fields to original values */
1394
1395 #if BYTE_ORDER != BIG_ENDIAN
1396                         HTONS(ip->ip_len);
1397                         HTONS(ip->ip_off);
1398 #endif
1399
1400                         /* Deliver packet to divert input routine */
1401                         divert_packet(m, 0, off & 0xffff, args.divert_rule);
1402
1403                         /* If 'tee', continue with original packet */
1404                         if (clone != NULL) {
1405                                 m0 = m = clone;
1406                                 ip = mtod(m, struct ip *);
1407                                 goto pass;
1408                         }
1409                         goto done;
1410                 }
1411 #endif
1412
1413 #if IPFIREWALL_FORWARD
1414                 /* Here we check dst to make sure it's directly reachable on the
1415                  * interface we previously thought it was.
1416                  * If it isn't (which may be likely in some situations) we have
1417                  * to re-route it (ie, find a route for the next-hop and the
1418                  * associated interface) and set them here. This is nested
1419                  * forwarding which in most cases is undesirable, except where
1420                  * such control is nigh impossible. So we do it here.
1421                  * And I'm babbling.
1422                  */
1423                 if (off == 0 && old != dst) {
1424                         struct in_ifaddr *ia_fw;
1425
1426                         /* It's changed... */
1427                         /* There must be a better way to do this next line... */
1428                         static struct route sro_fwd, *ro_fwd = &sro_fwd;
1429 #if IPFIREWALL_FORWARD_DEBUG
1430                         printf("IPFIREWALL_FORWARD: New dst ip: ");
1431                         print_ip(dst->sin_addr);
1432                         printf("\n");
1433 #endif
1434                         /*
1435                          * We need to figure out if we have been forwarded
1436                          * to a local socket. If so then we should somehow
1437                          * "loop back" to ip_input, and get directed to the
1438                          * PCB as if we had received this packet. This is
1439                          * because it may be difficult to identify the packets
1440                          * you want to forward until they are being output
1441                          * and have selected an interface. (e.g. locally
1442                          * initiated packets) If we used the loopback interface,
1443                          * we would not be able to control what happens
1444                          * as the packet runs through ip_input() as
1445                          * it is done through an ISR.
1446                          */
1447                         lck_rw_lock_shared(in_ifaddr_rwlock);
1448                         TAILQ_FOREACH(ia_fw, &in_ifaddrhead, ia_link) {
1449                                 /*
1450                                  * If the addr to forward to is one
1451                                  * of ours, we pretend to
1452                                  * be the destination for this packet.
1453                                  */
1454                                 IFA_LOCK_SPIN(&ia_fw->ia_ifa);
1455                                 if (IA_SIN(ia_fw)->sin_addr.s_addr ==
1456                                     dst->sin_addr.s_addr) {
1457                                         IFA_UNLOCK(&ia_fw->ia_ifa);
1458                                         break;
1459                                 }
1460                                 IFA_UNLOCK(&ia_fw->ia_ifa);
1461                         }
1462                         lck_rw_done(in_ifaddr_rwlock);
1463                         if (ia_fw) {
1464                                 /* tell ip_input "dont filter" */
1465                                 struct m_tag            *fwd_tag;
1466                                 struct ip_fwd_tag       *ipfwd_tag;
1467
1468                                 fwd_tag = m_tag_create(KERNEL_MODULE_TAG_ID,
1469                                     KERNEL_TAG_TYPE_IPFORWARD,
1470                                     sizeof (*ipfwd_tag), M_NOWAIT, m);
1471                                 if (fwd_tag == NULL) {
1472                                         error = ENOBUFS;
1473                                         goto bad;
1474                                 }
1475
1476                                 ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag+1);
1477                                 ipfwd_tag->next_hop = args.next_hop;
1478
1479                                 m_tag_prepend(m, fwd_tag);
1480
1481                                 if (m->m_pkthdr.rcvif == NULL)
1482                                         m->m_pkthdr.rcvif = lo_ifp;
1483                                 if ((~IF_HWASSIST_CSUM_FLAGS(m->m_pkthdr.rcvif->if_hwassist) &
1484                                                 m->m_pkthdr.csum_flags) == 0) {
1485                                         if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1486                                                 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1487                                                 m->m_pkthdr.csum_flags |=
1488                                                         CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
1489                                                 m->m_pkthdr.csum_data = 0xffff;
1490                                         }
1491                                         m->m_pkthdr.csum_flags |=
1492                                                 CSUM_IP_CHECKED | CSUM_IP_VALID;
1493                                 }
1494                                 else if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1495                                         in_delayed_cksum(m);
1496                                         m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1497                                         ip->ip_sum = in_cksum(m, hlen);
1498                                 }
1499
1500 #if BYTE_ORDER != BIG_ENDIAN
1501                                 HTONS(ip->ip_len);
1502                                 HTONS(ip->ip_off);
1503 #endif
1504
1505                                 /*  we need to call dlil_output to run filters
1506                                  *      and resync to avoid recursion loops.
1507                                  */
1508                                 if (lo_ifp) {
1509                                         dlil_output(lo_ifp, PF_INET, m, 0, (struct sockaddr *)dst, 0);
1510                                 }
1511                                 else {
1512                                         printf("ip_output: no loopback ifp for forwarding!!!\n");
1513                                 }
1514                                 goto done;
1515                         }
1516                         /* Some of the logic for this was
1517                          * nicked from above.
1518                          *
1519                          * This rewrites the cached route in a local PCB.
1520                          * Is this what we want to do?
1521                          */
1522                         bcopy(dst, &ro_fwd->ro_dst, sizeof(*dst));
1523
1524                         ro_fwd->ro_rt = NULL;
1525                         rtalloc_ign(ro_fwd, RTF_PRCLONING);
1526
1527                         if (ro_fwd->ro_rt == NULL) {
1528                                 OSAddAtomic(1, &ipstat.ips_noroute);
1529                                 error = EHOSTUNREACH;
1530                                 goto bad;
1531                         }
1532
1533                         RT_LOCK_SPIN(ro_fwd->ro_rt);
1534                         ia_fw = ifatoia(ro_fwd->ro_rt->rt_ifa);
1535                         if (ia_fw != NULL) {
1536                                 /* Become a regular mutex */
1537                                 RT_CONVERT_LOCK(ro_fwd->ro_rt);
1538                                 IFA_ADDREF(&ia_fw->ia_ifa);
1539                         }
1540                         ifp = ro_fwd->ro_rt->rt_ifp;
1541                         ro_fwd->ro_rt->rt_use++;
1542                         if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY)
1543                                 dst = (struct sockaddr_in *)ro_fwd->ro_rt->rt_gateway;
1544                         if (ro_fwd->ro_rt->rt_flags & RTF_HOST) {
1545                                 isbroadcast =
1546                                     (ro_fwd->ro_rt->rt_flags & RTF_BROADCAST);
1547                         } else {
1548                                 /* Become a regular mutex */
1549                                 RT_CONVERT_LOCK(ro_fwd->ro_rt);
1550                                 isbroadcast = in_broadcast(dst->sin_addr, ifp);
1551                         }
1552                         RT_UNLOCK(ro_fwd->ro_rt);
1553                         rtfree(ro->ro_rt);
1554                         ro->ro_rt = ro_fwd->ro_rt;
1555                         dst = (struct sockaddr_in *)&ro_fwd->ro_dst;
1556
1557                         /*
1558                          * If we added a default src ip earlier,
1559                          * which would have been gotten from the-then
1560                          * interface, do it again, from the new one.
1561                          */
1562                         if (ia_fw != NULL) {
1563                                 if (fwd_rewrite_src) {
1564                                         IFA_LOCK_SPIN(&ia_fw->ia_ifa);
1565                                         ip->ip_src = IA_SIN(ia_fw)->sin_addr;
1566                                         IFA_UNLOCK(&ia_fw->ia_ifa);
1567                                 }
1568                                 IFA_REMREF(&ia_fw->ia_ifa);
1569                         }
1570                         goto pass ;
1571                 }
1572 #endif /* IPFIREWALL_FORWARD */
1573                 /*
1574                  * if we get here, none of the above matches, and
1575                  * we have to drop the pkt
1576                  */
1577                 m_freem(m);
1578                 error = EACCES; /* not sure this is the right error msg */
1579                 goto done;
1580         }
1581
1582 pass:
1583 #endif /* IPFIREWALL */
1584 #if __APPLE__
1585         /* Do not allow loopback address to wind up on a wire */
1586         if ((ifp->if_flags & IFF_LOOPBACK) == 0 &&
1587                  ((ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
1588                   (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)) {
1589                 OSAddAtomic(1, &ipstat.ips_badaddr);
1590                 m_freem(m);
1591                 /*
1592                  * Do not simply drop the packet just like a firewall -- we want the
1593                  * the application to feel the pain.
1594                  * Return ENETUNREACH like ip6_output does in some similar cases.
1595                  * This can startle the otherwise clueless process that specifies
1596                  * loopback as the source address.
1597                  */
1598                 error = ENETUNREACH;
1599                 goto done;
1600         }
1601 #endif
1602         m->m_pkthdr.csum_flags |= CSUM_IP;
1603         tso =  (ifp->if_hwassist & IFNET_TSO_IPV4) && (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4);
1604
1605         sw_csum = m->m_pkthdr.csum_flags
1606                 & ~IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist);
1607
1608         if ((ifp->if_hwassist & CSUM_TCP_SUM16) != 0) {
1609                 /*
1610                  * Special case code for GMACE
1611                  * frames that can be checksumed by GMACE SUM16 HW:
1612                  * frame >64, no fragments, no UDP
1613                  */
1614                 if (apple_hwcksum_tx && (m->m_pkthdr.csum_flags & CSUM_TCP)
1615                         && (ip->ip_len > 50) && (ip->ip_len <= ifp->if_mtu)) {
1616                         /* Apple GMAC HW, expects STUFF_OFFSET << 16  | START_OFFSET */
1617                         u_short offset = (IP_VHL_HL(ip->ip_vhl) << 2) +14 ; /* IP+Enet header length */
1618                         u_short csumprev= m->m_pkthdr.csum_data & 0xFFFF;
1619                         m->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_TCP_SUM16; /* for GMAC */
1620                         m->m_pkthdr.csum_data = (csumprev + offset)  << 16 ;
1621                         m->m_pkthdr.csum_data += offset;
1622                 sw_csum = CSUM_DELAY_IP; /* do IP hdr chksum in software */
1623                 }
1624                 else {
1625                         /* let the software handle any UDP or TCP checksums */
1626                         sw_csum |= (CSUM_DELAY_DATA & m->m_pkthdr.csum_flags);
1627                 }
1628         } else if (apple_hwcksum_tx == 0) {
1629                 sw_csum |= (CSUM_DELAY_DATA | CSUM_DELAY_IP) &
1630                     m->m_pkthdr.csum_flags;
1631         }
1632
1633         if (sw_csum & CSUM_DELAY_DATA) {
1634                 in_delayed_cksum(m);
1635                 sw_csum &= ~CSUM_DELAY_DATA;
1636                 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1637         }
1638
1639         if (apple_hwcksum_tx != 0) {
1640                 m->m_pkthdr.csum_flags &=
1641                     IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist);
1642         } else {
1643                 m->m_pkthdr.csum_flags = 0;
1644         }
1645
1646         /*
1647          * If small enough for interface, or the interface will take
1648          * care of the fragmentation for us, can just send directly.
1649          */
1650         if ((u_short)ip->ip_len <= ifp->if_mtu || tso ||
1651             ifp->if_hwassist & CSUM_FRAGMENT) {
1652                 if (tso)
1653                         m->m_pkthdr.csum_flags |= CSUM_TSO_IPV4;
1654
1655
1656 #if BYTE_ORDER != BIG_ENDIAN
1657                 HTONS(ip->ip_len);
1658                 HTONS(ip->ip_off);
1659 #endif
1660
1661                 ip->ip_sum = 0;
1662                 if (sw_csum & CSUM_DELAY_IP) {
1663                         ip->ip_sum = in_cksum(m, hlen);
1664                 }
1665
1666 #ifndef __APPLE__
1667                 /* Record statistics for this interface address. */
1668                 if (!(flags & IP_FORWARDING) && ia != NULL) {
1669                         ia->ia_ifa.if_opackets++;
1670                         ia->ia_ifa.if_obytes += m->m_pkthdr.len;
1671                 }
1672 #endif
1673
1674 #if IPSEC
1675                 /* clean ipsec history once it goes out of the node */
1676                 if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0)
1677                         ipsec_delaux(m);
1678 #endif
1679                 if (packetchain == 0) {
1680                         if (ro->ro_rt && nstat_collect)
1681                                 nstat_route_tx(ro->ro_rt, 1, m->m_pkthdr.len, 0);
1682                         error = ifnet_output(ifp, PF_INET, m, ro->ro_rt,
1683                             (struct sockaddr *)dst);
1684                         goto done;
1685                 }
1686                 else { /* packet chaining allows us to reuse the route for all packets */
1687                         bytecnt += m->m_pkthdr.len;
1688                         mppn = &m->m_nextpkt;
1689                         m = m->m_nextpkt;
1690                         if (m == NULL) {
1691 #if PF
1692 sendchain:
1693 #endif /* PF */
1694                                 if (pktcnt > ip_maxchainsent)
1695                                         ip_maxchainsent = pktcnt;
1696                                 if (ro->ro_rt && nstat_collect)
1697                                         nstat_route_tx(ro->ro_rt, pktcnt, bytecnt, 0);
1698                                 //send
1699                                 error = ifnet_output(ifp, PF_INET, packetlist,
1700                                     ro->ro_rt, (struct sockaddr *)dst);
1701                                 pktcnt = 0;
1702                                 bytecnt = 0;
1703                                 goto done;
1704
1705                         }
1706                         m0 = m;
1707                         pktcnt++;
1708                         goto loopit;
1709                 }
1710         }
1711         /*
1712          * Too large for interface; fragment if possible.
1713          * Must be able to put at least 8 bytes per fragment.
1714          */
1715
1716         if (ip->ip_off & IP_DF  || (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) ||
1717                 pktcnt > 0) {
1718                 error = EMSGSIZE;
1719                 /*
1720                  * This case can happen if the user changed the MTU
1721                  * of an interface after enabling IP on it.  Because
1722                  * most netifs don't keep track of routes pointing to
1723                  * them, there is no way for one to update all its
1724                  * routes when the MTU is changed.
1725                  */
1726                 if (ro->ro_rt) {
1727                         RT_LOCK_SPIN(ro->ro_rt);
1728                         if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST))
1729                             && !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU)
1730                             && (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) {
1731                                 ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
1732                         }
1733                         RT_UNLOCK(ro->ro_rt);
1734                 }
1735                 if (pktcnt > 0) {
1736                         m0 = packetlist;
1737                 }
1738                 OSAddAtomic(1, &ipstat.ips_cantfrag);
1739                 goto bad;
1740         }
1741
1742         error = ip_fragment(m, ifp, ifp->if_mtu, sw_csum);
1743         if (error != 0) {
1744                 m0 = m = NULL;
1745                 goto bad;
1746         }
1747
1748         KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr,
1749                      ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len);
1750
1751         for (m = m0; m; m = m0) {
1752                 m0 = m->m_nextpkt;
1753                 m->m_nextpkt = 0;
1754 #if IPSEC
1755                 /* clean ipsec history once it goes out of the node */
1756                 if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0)
1757                         ipsec_delaux(m);
1758 #endif
1759                 if (error == 0) {
1760 #ifndef __APPLE__
1761                         /* Record statistics for this interface address. */
1762                         if (ia != NULL) {
1763                                 ia->ia_ifa.if_opackets++;
1764                                 ia->ia_ifa.if_obytes += m->m_pkthdr.len;
1765                         }
1766 #endif
1767                         if ((packetchain != 0)  && (pktcnt > 0))
1768                                 panic("ip_output: mix of packet in packetlist is wrong=%p", packetlist);
1769                         if (ro->ro_rt && nstat_collect)
1770                                 nstat_route_tx(ro->ro_rt, 1, m->m_pkthdr.len, 0);
1771                         error = ifnet_output(ifp, PF_INET, m, ro->ro_rt,
1772                             (struct sockaddr *)dst);
1773                 } else
1774                         m_freem(m);
1775         }
1776
1777         if (error == 0)
1778                 OSAddAtomic(1, &ipstat.ips_fragmented);
1779
1780 done:
1781         if (ia) {
1782                 IFA_REMREF(&ia->ia_ifa);
1783                 ia = NULL;
1784         }
1785 #if IPSEC
1786         if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) {
1787                 if (ipsec_state.ro.ro_rt)
1788                         rtfree(ipsec_state.ro.ro_rt);
1789         if (sp != NULL) {
1790                 KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
1791                         printf("DP ip_output call free SP:%x\n", sp));
1792                 key_freesp(sp, KEY_SADB_UNLOCKED);
1793         }
1794         }
1795 #endif /* IPSEC */
1796
1797         KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_END, error,0,0,0,0);
1798         return (error);
1799 bad:
1800         m_freem(m0);
1801         goto done;
1802 }
1803
1804 int
1805 ip_fragment(struct mbuf *m, struct ifnet *ifp, unsigned long mtu, int sw_csum)
1806 {
1807         struct ip *ip, *mhip;
1808         int len, hlen, mhlen, firstlen, off, error = 0;
1809         struct mbuf **mnext = &m->m_nextpkt, *m0;
1810         int nfrags = 1;
1811
1812         ip = mtod(m, struct ip *);
1813 #ifdef _IP_VHL
1814         hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1815 #else
1816         hlen = ip->ip_hl << 2;
1817 #endif
1818
1819         firstlen = len = (mtu - hlen) &~ 7;
1820         if (len < 8) {
1821                 m_freem(m);
1822                 return (EMSGSIZE);
1823         }
1824
1825         /*
1826          * if the interface will not calculate checksums on
1827          * fragmented packets, then do it here.
1828          */
1829         if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
1830             (ifp->if_hwassist & CSUM_IP_FRAGS) == 0) {
1831                 in_delayed_cksum(m);
1832                 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1833         }
1834
1835         /*
1836          * Loop through length of segment after first fragment,
1837          * make new header and copy data of each part and link onto chain.
1838          */
1839         m0 = m;
1840         mhlen = sizeof (struct ip);
1841         for (off = hlen + len; off < (u_short)ip->ip_len; off += len) {
1842                 MGETHDR(m, M_DONTWAIT, MT_HEADER);      /* MAC-OK */
1843                 if (m == 0) {
1844                         error = ENOBUFS;
1845                         OSAddAtomic(1, &ipstat.ips_odropped);
1846                         goto sendorfree;
1847                 }
1848                 m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
1849                 m->m_data += max_linkhdr;
1850                 mhip = mtod(m, struct ip *);
1851                 *mhip = *ip;
1852                 if (hlen > sizeof (struct ip)) {
1853                         mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
1854                         mhip->ip_vhl = IP_MAKE_VHL(IPVERSION, mhlen >> 2);
1855                 }
1856                 m->m_len = mhlen;
1857                 mhip->ip_off = ((off - hlen) >> 3) + (ip->ip_off & ~IP_MF);
1858                 if (ip->ip_off & IP_MF)
1859                         mhip->ip_off |= IP_MF;
1860                 if (off + len >= (u_short)ip->ip_len)
1861                         len = (u_short)ip->ip_len - off;
1862                 else
1863                         mhip->ip_off |= IP_MF;
1864                 mhip->ip_len = htons((u_short)(len + mhlen));
1865                 m->m_next = m_copy(m0, off, len);
1866                 if (m->m_next == 0) {
1867                         (void) m_free(m);
1868                         error = ENOBUFS;        /* ??? */
1869                         OSAddAtomic(1, &ipstat.ips_odropped);
1870                         goto sendorfree;
1871                 }
1872                 m->m_pkthdr.len = mhlen + len;
1873                 m->m_pkthdr.rcvif = 0;
1874                 m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
1875                 m->m_pkthdr.socket_id = m0->m_pkthdr.socket_id;
1876 #if CONFIG_MACF_NET
1877                 mac_netinet_fragment(m0, m);
1878 #endif
1879
1880 #if BYTE_ORDER != BIG_ENDIAN
1881                 HTONS(mhip->ip_off);
1882 #endif
1883
1884                 mhip->ip_sum = 0;
1885                 if (sw_csum & CSUM_DELAY_IP) {
1886                         mhip->ip_sum = in_cksum(m, mhlen);
1887                 }
1888                 *mnext = m;
1889                 mnext = &m->m_nextpkt;
1890                 nfrags++;
1891         }
1892         OSAddAtomic(nfrags, &ipstat.ips_ofragments);
1893
1894         /* set first/last markers for fragment chain */
1895         m->m_flags |= M_LASTFRAG;
1896         m0->m_flags |= M_FIRSTFRAG | M_FRAG;
1897         m0->m_pkthdr.csum_data = nfrags;
1898
1899         /*
1900          * Update first fragment by trimming what's been copied out
1901          * and updating header, then send each fragment (in order).
1902          */
1903         m = m0;
1904         m_adj(m, hlen + firstlen - (u_short)ip->ip_len);
1905         m->m_pkthdr.len = hlen + firstlen;
1906         ip->ip_len = htons((u_short)m->m_pkthdr.len);
1907         ip->ip_off |= IP_MF;
1908
1909 #if BYTE_ORDER != BIG_ENDIAN
1910         HTONS(ip->ip_off);
1911 #endif
1912
1913         ip->ip_sum = 0;
1914         if (sw_csum & CSUM_DELAY_IP) {
1915                 ip->ip_sum = in_cksum(m, hlen);
1916         }
1917 sendorfree:
1918         if (error)
1919                 m_freem_list(m0);
1920
1921         return (error);
1922 }
1923
/*
 * Hand a checksummed byte count to the output-checksum statistics
 * routine of the given IP protocol.  Only TCP and UDP are tracked;
 * any other protocol number is silently ignored.
 */
static void
ip_out_cksum_stats(int proto, u_int32_t len)
{
        if (proto == IPPROTO_TCP) {
                tcp_out_cksum_stats(len);
        } else if (proto == IPPROTO_UDP) {
                udp_out_cksum_stats(len);
        }
        /* keep only TCP or UDP stats for now */
}
1939
1940 void
1941 in_delayed_cksum_offset(struct mbuf *m0, int ip_offset)
1942 {
1943         struct ip *ip;
1944         unsigned char buf[sizeof(struct ip)];
1945         u_short csum, offset, ip_len;
1946
1947         /* Save copy of first mbuf pointer and the ip_offset before modifying */
1948         struct mbuf *m = m0;
1949         int ip_offset_copy = ip_offset;
1950
1951         while (ip_offset >= m->m_len) {
1952                 ip_offset -= m->m_len;
1953                 m = m->m_next;
1954                 if (m == NULL) {
1955                         printf("in_delayed_cksum_withoffset failed - ip_offset wasn't in the packet\n");
1956                         return;
1957                 }
1958         }
1959
1960         /* Sometimes the IP header is not contiguous, yes this can happen! */
1961         if (ip_offset + sizeof(struct ip) > m->m_len) {
1962 #if DEBUG
1963                 printf("delayed m_pullup, m->len: %d  off: %d\n",
1964                         m->m_len, ip_offset);
1965 #endif
1966                 m_copydata(m, ip_offset, sizeof(struct ip), (caddr_t) buf);
1967
1968                 ip = (struct ip *)buf;
1969         } else {
1970                 ip = (struct ip*)(m->m_data + ip_offset);
1971         }
1972
1973         /* Gross */
1974         if (ip_offset) {
1975                 m->m_len -= ip_offset;
1976                 m->m_data += ip_offset;
1977         }
1978
1979         offset = IP_VHL_HL(ip->ip_vhl) << 2 ;
1980
1981         /*
1982          * We could be in the context of an IP or interface filter; in the
1983          * former case, ip_len would be in host (correct) order while for
1984          * the latter it would be in network order.  Because of this, we
1985          * attempt to interpret the length field by comparing it against
1986          * the actual packet length.  If the comparison fails, byte swap
1987          * the length and check again.  If it still fails, then the packet
1988          * is bogus and we give up.
1989          */
1990         ip_len = ip->ip_len;
1991         if (ip_len != (m0->m_pkthdr.len - ip_offset_copy)) {
1992                 ip_len = SWAP16(ip_len);
1993                 if (ip_len != (m0->m_pkthdr.len - ip_offset_copy)) {
1994                         printf("in_delayed_cksum_offset: ip_len %d (%d) "
1995                             "doesn't match actual length %d\n", ip->ip_len,
1996                             ip_len, (m0->m_pkthdr.len - ip_offset_copy));
1997                         return;
1998                 }
1999         }
2000
2001         csum = in_cksum_skip(m, ip_len, offset);
2002
2003         /* Update stats */
2004         ip_out_cksum_stats(ip->ip_p, ip_len - offset);
2005
2006         if (m0->m_pkthdr.csum_flags & CSUM_UDP && csum == 0)
2007                 csum = 0xffff;
2008         offset += m0->m_pkthdr.csum_data & 0xFFFF;        /* checksum offset */
2009
2010         /* Gross */
2011         if (ip_offset) {
2012                 if (M_LEADINGSPACE(m) < ip_offset)
2013                         panic("in_delayed_cksum_offset - chain modified!\n");
2014                 m->m_len += ip_offset;
2015                 m->m_data -= ip_offset;
2016         }
2017
2018         if (offset > ip_len) /* bogus offset */
2019                 return;
2020
2021         /* Insert the checksum in the existing chain */
2022         if (offset + ip_offset + sizeof(u_short) > m->m_len) {
2023                 char tmp[2];
2024
2025 #if DEBUG
2026                 printf("delayed m_copyback, m->len: %d  off: %d  p: %d\n",
2027                     m->m_len, offset + ip_offset, ip->ip_p);
2028 #endif
2029                 *(u_short *)tmp = csum;
2030                 m_copyback(m, offset + ip_offset, 2, tmp);
2031         } else
2032                 *(u_short *)(m->m_data + offset + ip_offset) = csum;
2033 }
2034
/*
 * Finish the delayed transport-layer checksum of a packet whose IP
 * header sits at the very beginning of the mbuf chain `m'.  This is
 * in_delayed_cksum_offset() with an IP header offset of zero.
 */
void
in_delayed_cksum(struct mbuf *m)
{
        const int ip_hdr_offset = 0;    /* IP header starts the chain */

        in_delayed_cksum_offset(m, ip_hdr_offset);
}
2040
2041 void
2042 in_cksum_offset(struct mbuf* m, size_t ip_offset)
2043 {
2044         struct ip* ip = NULL;
2045         int hlen = 0;
2046         unsigned char buf[sizeof(struct ip)];
2047         int swapped = 0;
2048
2049         /* Save copy of first mbuf pointer and the ip_offset before modifying */
2050         struct mbuf* m0 = m;
2051         size_t ip_offset_copy = ip_offset;
2052
2053         while (ip_offset >= m->m_len) {
2054                 ip_offset -= m->m_len;
2055                 m = m->m_next;
2056                 if (m == NULL) {
2057                         printf("in_cksum_offset failed - ip_offset wasn't in the packet\n");
2058                         return;
2059                 }
2060         }
2061
2062         /* Sometimes the IP header is not contiguous, yes this can happen! */
2063         if (ip_offset + sizeof(struct ip) > m->m_len) {
2064
2065 #if DEBUG
2066                 printf("in_cksum_offset - delayed m_pullup, m->len: %d  off: %lu\n",
2067                         m->m_len, ip_offset);
2068 #endif
2069                 m_copydata(m, ip_offset, sizeof(struct ip), (caddr_t) buf);
2070
2071                 ip = (struct ip *)buf;
2072                 ip->ip_sum = 0;
2073                 m_copyback(m, ip_offset + offsetof(struct ip, ip_sum), 2, (caddr_t)&ip->ip_sum);
2074         } else {
2075                 ip = (struct ip*)(m->m_data + ip_offset);
2076                 ip->ip_sum = 0;
2077         }
2078
2079         /* Gross */
2080         if (ip_offset) {
2081                 m->m_len -= ip_offset;
2082                 m->m_data += ip_offset;
2083         }
2084
2085 #ifdef _IP_VHL
2086         hlen = IP_VHL_HL(ip->ip_vhl) << 2;
2087 #else
2088         hlen = ip->ip_hl << 2;
2089 #endif
2090         /*
2091          * We could be in the context of an IP or interface filter; in the
2092          * former case, ip_len would be in host order while for the latter
2093          * it would be in network (correct) order.  Because of this, we
2094          * attempt to interpret the length field by comparing it against
2095          * the actual packet length.  If the comparison fails, byte swap
2096          * the length and check again.  If it still fails, then the packet
2097          * is bogus and we give up.
2098          */
2099         if (ntohs(ip->ip_len) != (m0->m_pkthdr.len - ip_offset_copy)) {
2100                 ip->ip_len = SWAP16(ip->ip_len);
2101                 swapped = 1;
2102                 if (ntohs(ip->ip_len) != (m0->m_pkthdr.len - ip_offset_copy)) {
2103                         ip->ip_len = SWAP16(ip->ip_len);
2104                         printf("in_cksum_offset: ip_len %d (%d) "
2105                             "doesn't match actual length %lu\n",
2106                             ip->ip_len, SWAP16(ip->ip_len),
2107                             (m0->m_pkthdr.len - ip_offset_copy));
2108                         return;
2109                 }
2110         }
2111
2112         ip->ip_sum = 0;
2113         ip->ip_sum = in_cksum(m, hlen);
2114         if (swapped)
2115                 ip->ip_len = SWAP16(ip->ip_len);
2116
2117         /* Gross */
2118         if (ip_offset) {
2119                 if (M_LEADINGSPACE(m) < ip_offset)
2120                         panic("in_cksum_offset - chain modified!\n");
2121                 m->m_len += ip_offset;
2122                 m->m_data -= ip_offset;
2123         }
2124
2125         /* Insert the checksum in the existing chain if IP header not contiguous */
2126         if (ip_offset + sizeof(struct ip) > m->m_len) {
2127                 char tmp[2];
2128
2129 #if DEBUG
2130                 printf("in_cksum_offset m_copyback, m->len: %u  off: %lu  p: %d\n",
2131                     m->m_len, ip_offset + offsetof(struct ip, ip_sum), ip->ip_p);
2132 #endif
2133                 *(u_short *)tmp = ip->ip_sum;
2134                 m_copyback(m, ip_offset + offsetof(struct ip, ip_sum), 2, tmp);
2135         }
2136 }
2137
2138 /*
2139  * Insert IP options into preformed packet.
2140  * Adjust IP destination as required for IP source routing,
2141  * as indicated by a non-zero in_addr at the start of the options.
2142  *
2143  * XXX This routine assumes that the packet has no options in place.
2144  */
2145 static struct mbuf *
2146 ip_insertoptions(m, opt, phlen)
2147         register struct mbuf *m;
2148         struct mbuf *opt;
2149         int *phlen;
2150 {
2151         register struct ipoption *p = mtod(opt, struct ipoption *);
2152         struct mbuf *n;
2153         register struct ip *ip = mtod(m, struct ip *);
2154         unsigned optlen;
2155
2156         optlen = opt->m_len - sizeof(p->ipopt_dst);
2157         if (optlen + (u_short)ip->ip_len > IP_MAXPACKET)
2158                 return (m);             /* XXX should fail */
2159         if (p->ipopt_dst.s_addr)
2160                 ip->ip_dst = p->ipopt_dst;
2161         if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) {
2162                 MGETHDR(n, M_DONTWAIT, MT_HEADER);      /* MAC-OK */
2163                 if (n == 0)
2164                         return (m);
2165                 n->m_pkthdr.rcvif = 0;
2166 #if CONFIG_MACF_NET
2167                 mac_mbuf_label_copy(m, n);
2168 #endif
2169                 n->m_pkthdr.len = m->m_pkthdr.len + optlen;
2170                 m->m_len -= sizeof(struct ip);
2171                 m->m_data += sizeof(struct ip);
2172                 n->m_next = m;
2173                 m = n;
2174                 m->m_len = optlen + sizeof(struct ip);
2175                 m->m_data += max_linkhdr;
2176                 (void)memcpy(mtod(m, void *), ip, sizeof(struct ip));
2177         } else {
2178                 m->m_data -= optlen;
2179                 m->m_len += optlen;
2180                 m->m_pkthdr.len += optlen;
2181                 ovbcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
2182         }
2183         ip = mtod(m, struct ip *);
2184         bcopy(p->ipopt_list, ip + 1, optlen);
2185         *phlen = sizeof(struct ip) + optlen;
2186         ip->ip_vhl = IP_MAKE_VHL(IPVERSION, *phlen >> 2);
2187         ip->ip_len += optlen;
2188         return (m);
2189 }
2190
2191 /*
2192  * Copy options from ip to jp,
2193  * omitting those not copied during fragmentation.
2194  */
2195 int
2196 ip_optcopy(ip, jp)
2197         struct ip *ip, *jp;
2198 {
2199         register u_char *cp, *dp;
2200         int opt, optlen, cnt;
2201
2202         cp = (u_char *)(ip + 1);
2203         dp = (u_char *)(jp + 1);
2204         cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip);
2205         for (; cnt > 0; cnt -= optlen, cp += optlen) {
2206                 opt = cp[0];
2207                 if (opt == IPOPT_EOL)
2208                         break;
2209                 if (opt == IPOPT_NOP) {
2210                         /* Preserve for IP mcast tunnel's LSRR alignment. */
2211                         *dp++ = IPOPT_NOP;
2212                         optlen = 1;
2213                         continue;
2214                 }
2215 #if DIAGNOSTIC
2216                 if (cnt < IPOPT_OLEN + sizeof(*cp))
2217                         panic("malformed IPv4 option passed to ip_optcopy");
2218 #endif
2219                 optlen = cp[IPOPT_OLEN];
2220 #if DIAGNOSTIC
2221                 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
2222                         panic("malformed IPv4 option passed to ip_optcopy");
2223 #endif
2224                 /* bogus lengths should have been caught by ip_dooptions */
2225                 if (optlen > cnt)
2226                         optlen = cnt;
2227                 if (IPOPT_COPIED(opt)) {
2228                         bcopy(cp, dp, optlen);
2229                         dp += optlen;
2230                 }
2231         }
2232         for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++)
2233                 *dp++ = IPOPT_EOL;
2234         return (optlen);
2235 }
2236
2237 /*
2238  * IP socket option processing.
2239  */
2240 int
2241 ip_ctloutput(so, sopt)
2242         struct socket *so;
2243         struct sockopt *sopt;
2244 {
2245         struct  inpcb *inp = sotoinpcb(so);
2246         int     error, optval;
2247
2248         error = optval = 0;
2249         if (sopt->sopt_level != IPPROTO_IP) {
2250                 return (EINVAL);
2251         }
2252
2253         switch (sopt->sopt_dir) {
2254         case SOPT_SET:
2255                 switch (sopt->sopt_name) {
2256                 case IP_OPTIONS:
2257 #ifdef notyet
2258                 case IP_RETOPTS:
2259 #endif
2260                 {
2261                         struct mbuf *m;
2262                         if (sopt->sopt_valsize > MLEN) {
2263                                 error = EMSGSIZE;
2264                                 break;
2265                         }
2266                         MGET(m, sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT,
2267                             MT_HEADER);
2268                         if (m == 0) {
2269                                 error = ENOBUFS;
2270                                 break;
2271                         }
2272                         m->m_len = sopt->sopt_valsize;
2273                         error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
2274                                             m->m_len);
2275                         if (error)
2276                                 break;
2277
2278                         return (ip_pcbopts(sopt->sopt_name, &inp->inp_options,
2279                                            m));
2280                 }
2281
2282                 case IP_TOS:
2283                 case IP_TTL:
2284                 case IP_RECVOPTS:
2285                 case IP_RECVRETOPTS:
2286                 case IP_RECVDSTADDR:
2287                 case IP_RECVIF:
2288                 case IP_RECVTTL:
2289 #if defined(NFAITH) && NFAITH > 0
2290                 case IP_FAITH:
2291 #endif
2292                 case IP_RECVPKTINFO:
2293                         error = sooptcopyin(sopt, &optval, sizeof optval,
2294                                             sizeof optval);
2295                         if (error)
2296                                 break;
2297
2298                         switch (sopt->sopt_name) {
2299                         case IP_TOS:
2300                                 inp->inp_ip_tos = optval;
2301                                 break;
2302
2303                         case IP_TTL:
2304                                 inp->inp_ip_ttl = optval;
2305                                 break;
2306 #define OPTSET(bit) \
2307         if (optval) \
2308                 inp->inp_flags |= bit; \
2309         else \
2310                 inp->inp_flags &= ~bit;
2311
2312                         case IP_RECVOPTS:
2313                                 OPTSET(INP_RECVOPTS);
2314                                 break;
2315
2316                         case IP_RECVRETOPTS:
2317                                 OPTSET(INP_RECVRETOPTS);
2318                                 break;
2319
2320                         case IP_RECVDSTADDR:
2321                                 OPTSET(INP_RECVDSTADDR);
2322                                 break;
2323
2324                         case IP_RECVIF:
2325                                 OPTSET(INP_RECVIF);
2326                                 break;
2327
2328                         case IP_RECVTTL:
2329                                 OPTSET(INP_RECVTTL);
2330                                 break;
2331
2332 #if defined(NFAITH) && NFAITH > 0
2333                         case IP_FAITH:
2334                                 OPTSET(INP_FAITH);
2335                                 break;
2336 #endif
2337                         case IP_RECVPKTINFO:
2338                                 OPTSET(INP_PKTINFO);
2339                                 break;
2340                         }
2341                         break;
2342 #undef OPTSET
2343
2344 #if CONFIG_FORCE_OUT_IFP
2345                 /*
2346                  * Apple private interface, similar to IP_BOUND_IF, except
2347                  * that the parameter is a NULL-terminated string containing
2348                  * the name of the network interface; an emptry string means
2349                  * unbind.  Applications are encouraged to use IP_BOUND_IF
2350                  * instead, as that is the current "official" API.
2351                  */
2352                 case IP_FORCE_OUT_IFP: {
2353                         char ifname[IFNAMSIZ];
2354                         unsigned int ifscope;
2355
2356                         /* This option is settable only for IPv4 */
2357                         if (!(inp->inp_vflag & INP_IPV4)) {
2358                                 error = EINVAL;
2359                                 break;
2360                         }
2361
2362                         /* Verify interface name parameter is sane */
2363                         if (sopt->sopt_valsize > sizeof(ifname)) {
2364                                 error = EINVAL;
2365                                 break;
2366                         }
2367
2368                         /* Copy the interface name */
2369                         if (sopt->sopt_valsize != 0) {
2370                                 error = sooptcopyin(sopt, ifname,
2371                                     sizeof (ifname), sopt->sopt_valsize);
2372                                 if (error)
2373                                         break;
2374                         }
2375
2376                         if (sopt->sopt_valsize == 0 || ifname[0] == '\0') {
2377                                 /* Unbind this socket from any interface */
2378                                 ifscope = IFSCOPE_NONE;
2379                         } else {
2380                                 ifnet_t ifp;
2381
2382                                 /* Verify name is NULL terminated */
2383                                 if (ifname[sopt->sopt_valsize - 1] != '\0') {
2384                                         error = EINVAL;
2385                                         break;
2386                                 }
2387
2388                                 /* Bail out if given bogus interface name */
2389                                 if (ifnet_find_by_name(ifname, &ifp) != 0) {
2390                                         error = ENXIO;
2391                                         break;
2392                                 }
2393
2394                                 /* Bind this socket to this interface */
2395                                 ifscope = ifp->if_index;
2396
2397                                 /*
2398                                  * Won't actually free; since we don't release
2399                                  * this later, we should do it now.
2400                                  */
2401                                 ifnet_release(ifp);
2402                         }
2403                         inp_bindif(inp, ifscope);
2404                 }
2405                 break;
2406 #endif
2407                 /*
2408                  * Multicast socket options are processed by the in_mcast
2409                  * module.
2410                  */
2411                 case IP_MULTICAST_IF:
2412                 case IP_MULTICAST_IFINDEX:
2413                 case IP_MULTICAST_VIF:
2414                 case IP_MULTICAST_TTL:
2415                 case IP_MULTICAST_LOOP:
2416                 case IP_ADD_MEMBERSHIP:
2417                 case IP_DROP_MEMBERSHIP:
2418                 case IP_ADD_SOURCE_MEMBERSHIP:
2419                 case IP_DROP_SOURCE_MEMBERSHIP:
2420                 case IP_BLOCK_SOURCE:
2421                 case IP_UNBLOCK_SOURCE:
2422                 case IP_MSFILTER:
2423                 case MCAST_JOIN_GROUP:
2424                 case MCAST_LEAVE_GROUP:
2425                 case MCAST_JOIN_SOURCE_GROUP:
2426                 case MCAST_LEAVE_SOURCE_GROUP:
2427                 case MCAST_BLOCK_SOURCE:
2428                 case MCAST_UNBLOCK_SOURCE:
2429                         error = inp_setmoptions(inp, sopt);
2430                         break;
2431
2432                 case IP_PORTRANGE:
2433                         error = sooptcopyin(sopt, &optval, sizeof optval,
2434                                             sizeof optval);
2435                         if (error)
2436                                 break;
2437
2438                         switch (optval) {
2439                         case IP_PORTRANGE_DEFAULT:
2440                                 inp->inp_flags &= ~(INP_LOWPORT);
2441                                 inp->inp_flags &= ~(INP_HIGHPORT);
2442                                 break;
2443
2444                         case IP_PORTRANGE_HIGH:
2445                                 inp->inp_flags &= ~(INP_LOWPORT);
2446                                 inp->inp_flags |= INP_HIGHPORT;
2447                                 break;
2448
2449                         case IP_PORTRANGE_LOW:
2450                                 inp->inp_flags &= ~(INP_HIGHPORT);
2451                                 inp->inp_flags |= INP_LOWPORT;
2452                                 break;
2453
2454                         default:
2455                                 error = EINVAL;
2456                                 break;
2457                         }
2458                         break;
2459
2460 #if IPSEC
2461                 case IP_IPSEC_POLICY:
2462                 {
2463                         caddr_t req = NULL;
2464                         size_t len = 0;
2465                         int priv;
2466                         struct mbuf *m;
2467                         int optname;
2468
2469                         if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
2470                                 break;
2471                         if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
2472                                 break;
2473                         priv = (proc_suser(sopt->sopt_p) == 0);
2474                         if (m) {
2475                                 req = mtod(m, caddr_t);
2476                                 len = m->m_len;
2477                         }
2478                         optname = sopt->sopt_name;
2479                         error = ipsec4_set_policy(inp, optname, req, len, priv);
2480                         m_freem(m);
2481                         break;
2482                 }
2483 #endif /*IPSEC*/
2484
2485 #if TRAFFIC_MGT
2486                 case IP_TRAFFIC_MGT_BACKGROUND:
2487                 {
2488                         unsigned        background = 0;
2489                         error = sooptcopyin(sopt, &background, sizeof(background), sizeof(background));
2490                         if (error)
2491                                 break;
2492
2493                         if (background) {
2494                                 socket_set_traffic_mgt_flags_locked(so,
2495                                     TRAFFIC_MGT_SO_BACKGROUND);
2496                         } else {
2497                                 socket_clear_traffic_mgt_flags_locked(so,
2498                                     TRAFFIC_MGT_SO_BACKGROUND);
2499                         }
2500
2501                         break;
2502                 }
2503 #endif /* TRAFFIC_MGT */
2504
2505                 /*
2506                  * On a multihomed system, scoped routing can be used to
2507                  * restrict the source interface used for sending packets.
2508                  * The socket option IP_BOUND_IF binds a particular AF_INET
2509                  * socket to an interface such that data sent on the socket
2510                  * is restricted to that interface.  This is unlike the
2511                  * SO_DONTROUTE option where the routing table is bypassed;
2512                  * therefore it allows for a greater flexibility and control
2513                  * over the system behavior, and does not place any restriction
2514                  * on the destination address type (e.g.  unicast, multicast,
2515                  * or broadcast if applicable) or whether or not the host is
2516                  * directly reachable.  Note that in the multicast transmit
2517                  * case, IP_MULTICAST_{IF,IFINDEX} takes precedence over
2518                  * IP_BOUND_IF, since the former practically bypasses the
2519                  * routing table; in this case, IP_BOUND_IF sets the default
2520                  * interface used for sending multicast packets in the absence
2521                  * of an explicit multicast transmit interface.
2522                  */
2523                 case IP_BOUND_IF:
2524                         /* This option is settable only for IPv4 */
2525                         if (!(inp->inp_vflag & INP_IPV4)) {
2526                                 error = EINVAL;
2527                                 break;
2528                         }
2529
2530                         error = sooptcopyin(sopt, &optval, sizeof (optval),
2531                             sizeof (optval));
2532
2533                         if (error)
2534                                 break;
2535
2536                         inp_bindif(inp, optval);
2537                         break;
2538
2539                 case IP_NO_IFT_CELLULAR:
2540                         /* This option is settable only for IPv4 */
2541                         if (!(inp->inp_vflag & INP_IPV4)) {
2542                                 error = EINVAL;
2543                                 break;
2544                         }
2545
2546                         error = sooptcopyin(sopt, &optval, sizeof (optval),
2547                             sizeof (optval));
2548
2549                         if (error)
2550                                 break;
2551
2552                         error = inp_nocellular(inp, optval);
2553                         break;
2554
2555                 case IP_OUT_IF:
2556                         /* This option is not settable */
2557                         error = EINVAL;
2558                         break;
2559
2560                 default:
2561                         error = ENOPROTOOPT;
2562                         break;
2563                 }
2564                 break;
2565
2566         case SOPT_GET:
2567                 switch (sopt->sopt_name) {
2568                 case IP_OPTIONS:
2569                 case IP_RETOPTS:
2570                         if (inp->inp_options)
2571                                 error = sooptcopyout(sopt,
2572                                                      mtod(inp->inp_options,
2573                                                           char *),
2574                                                      inp->inp_options->m_len);
2575                         else
2576                                 sopt->sopt_valsize = 0;
2577                         break;
2578
2579                 case IP_TOS:
2580                 case IP_TTL:
2581                 case IP_RECVOPTS:
2582                 case IP_RECVRETOPTS:
2583                 case IP_RECVDSTADDR:
2584                 case IP_RECVIF:
2585                 case IP_RECVTTL:
2586                 case IP_PORTRANGE:
2587 #if defined(NFAITH) && NFAITH > 0
2588                 case IP_FAITH:
2589 #endif
2590                 case IP_RECVPKTINFO:
2591                         switch (sopt->sopt_name) {
2592
2593                         case IP_TOS:
2594                                 optval = inp->inp_ip_tos;
2595                                 break;
2596
2597                         case IP_TTL:
2598                                 optval = inp->inp_ip_ttl;
2599                                 break;
2600
2601 #define OPTBIT(bit)     (inp->inp_flags & bit ? 1 : 0)
2602
2603                         case IP_RECVOPTS:
2604                                 optval = OPTBIT(INP_RECVOPTS);
2605                                 break;
2606
2607                         case IP_RECVRETOPTS:
2608                                 optval = OPTBIT(INP_RECVRETOPTS);
2609                                 break;
2610
2611                         case IP_RECVDSTADDR:
2612                                 optval = OPTBIT(INP_RECVDSTADDR);
2613                                 break;
2614
2615                         case IP_RECVIF:
2616                                 optval = OPTBIT(INP_RECVIF);
2617                                 break;
2618
2619                         case IP_RECVTTL:
2620                                 optval = OPTBIT(INP_RECVTTL);
2621                                 break;
2622
2623                         case IP_PORTRANGE:
2624                                 if (inp->inp_flags & INP_HIGHPORT)
2625                                         optval = IP_PORTRANGE_HIGH;
2626                                 else if (inp->inp_flags & INP_LOWPORT)
2627                                         optval = IP_PORTRANGE_LOW;
2628                                 else
2629                                         optval = 0;
2630                                 break;
2631
2632 #if defined(NFAITH) && NFAITH > 0
2633                         case IP_FAITH:
2634                                 optval = OPTBIT(INP_FAITH);
2635                                 break;
2636 #endif
2637                         case IP_RECVPKTINFO:
2638                                 optval = OPTBIT(INP_PKTINFO);
2639                                 break;
2640                         }
2641                         error = sooptcopyout(sopt, &optval, sizeof optval);
2642                         break;
2643
2644                 case IP_MULTICAST_IF:
2645                 case IP_MULTICAST_IFINDEX:
2646                 case IP_MULTICAST_VIF:
2647                 case IP_MULTICAST_TTL:
2648                 case IP_MULTICAST_LOOP:
2649                 case IP_MSFILTER:
2650                         error = inp_getmoptions(inp, sopt);
2651                         break;
2652
2653 #if IPSEC
2654                 case IP_IPSEC_POLICY:
2655                 {
2656                         struct mbuf *m = NULL;
2657                         caddr_t req = NULL;
2658                         size_t len = 0;
2659
2660                         if (m != 0) {
2661                                 req = mtod(m, caddr_t);
2662                                 len = m->m_len;
2663                         }
2664                         error = ipsec4_get_policy(sotoinpcb(so), req, len, &m);
2665                         if (error == 0)
2666                                 error = soopt_mcopyout(sopt, m); /* XXX */
2667                         if (error == 0)
2668                                 m_freem(m);
2669                         break;
2670                 }
2671 #endif /*IPSEC*/
2672
2673 #if TRAFFIC_MGT
2674                 case IP_TRAFFIC_MGT_BACKGROUND:
2675                 {
2676                         unsigned        background = (so->so_traffic_mgt_flags &  TRAFFIC_MGT_SO_BACKGROUND);
2677                         return (sooptcopyout(sopt, &background, sizeof(background)));
2678                         break;
2679                 }
2680 #endif /* TRAFFIC_MGT */
2681
2682                 case IP_BOUND_IF:
2683                         if (inp->inp_flags & INP_BOUND_IF)
2684                                 optval = inp->inp_boundif;
2685                         error = sooptcopyout(sopt, &optval, sizeof (optval));
2686                         break;
2687
2688                 case IP_NO_IFT_CELLULAR:
2689                         optval = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0;
2690                         error = sooptcopyout(sopt, &optval, sizeof (optval));
2691                         break;
2692
2693                 case IP_OUT_IF:
2694                         optval = inp->inp_last_outif;
2695                         error = sooptcopyout(sopt, &optval, sizeof (optval));
2696                         break;
2697
2698                 default:
2699                         error = ENOPROTOOPT;
2700                         break;
2701                 }
2702                 break;
2703         }
2704         return (error);
2705 }
2706
2707 /*
2708  * Set up IP options in pcb for insertion in output packets.
2709  * Store in mbuf with pointer in pcbopt, adding pseudo-option
2710  * with destination address if source routed.
2711  */
2712 static int
2713 ip_pcbopts(
2714         __unused int optname,
2715         struct mbuf **pcbopt,
2716         register struct mbuf *m)
2717 {
2718         register int cnt, optlen;
2719         register u_char *cp;
2720         u_char opt;
2721
2722         /* turn off any old options */
2723         if (*pcbopt)
2724                 (void)m_free(*pcbopt);
2725         *pcbopt = 0;
2726         if (m == (struct mbuf *)0 || m->m_len == 0) {
2727                 /*
2728                  * Only turning off any previous options.
2729                  */
2730                 if (m)
2731                         (void)m_free(m);
2732                 return (0);
2733         }
2734
2735 #ifndef vax
2736         if (m->m_len % sizeof(int32_t))
2737                 goto bad;
2738 #endif
2739         /*
2740          * IP first-hop destination address will be stored before
2741          * actual options; move other options back
2742          * and clear it when none present.
2743          */
2744         if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN])
2745                 goto bad;
2746         cnt = m->m_len;
2747         m->m_len += sizeof(struct in_addr);
2748         cp = mtod(m, u_char *) + sizeof(struct in_addr);
2749         ovbcopy(mtod(m, caddr_t), (caddr_t)cp, (unsigned)cnt);
2750         bzero(mtod(m, caddr_t), sizeof(struct in_addr));
2751
2752         for (; cnt > 0; cnt -= optlen, cp += optlen) {
2753                 opt = cp[IPOPT_OPTVAL];
2754                 if (opt == IPOPT_EOL)
2755                         break;
2756                 if (opt == IPOPT_NOP)
2757                         optlen = 1;
2758                 else {
2759                         if (cnt < IPOPT_OLEN + sizeof(*cp))
2760                                 goto bad;
2761                         optlen = cp[IPOPT_OLEN];
2762                         if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
2763                                 goto bad;
2764                 }
2765                 switch (opt) {
2766
2767                 default:
2768                         break;
2769
2770                 case IPOPT_LSRR:
2771                 case IPOPT_SSRR:
2772                         /*
2773                          * user process specifies route as:
2774                          *      ->A->B->C->D
2775                          * D must be our final destination (but we can't
2776                          * check that since we may not have connected yet).
2777                          * A is first hop destination, which doesn't appear in
2778                          * actual IP option, but is stored before the options.
2779                          */
2780                         if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr))
2781                                 goto bad;
2782                         m->m_len -= sizeof(struct in_addr);
2783                         cnt -= sizeof(struct in_addr);
2784                         optlen -= sizeof(struct in_addr);
2785                         cp[IPOPT_OLEN] = optlen;
2786                         /*
2787                          * Move first hop before start of options.
2788                          */
2789                         bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t),
2790                             sizeof(struct in_addr));
2791                         /*
2792                          * Then copy rest of options back
2793                          * to close up the deleted entry.
2794                          */
2795                         ovbcopy((caddr_t)(&cp[IPOPT_OFFSET+1] +
2796                             sizeof(struct in_addr)),
2797                             (caddr_t)&cp[IPOPT_OFFSET+1],
2798                             (unsigned)cnt + sizeof(struct in_addr));
2799                         break;
2800                 }
2801         }
2802         if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr))
2803                 goto bad;
2804         *pcbopt = m;
2805         return (0);
2806
2807 bad:
2808         (void)m_free(m);
2809         return (EINVAL);
2810 }
2811
2812 void
2813 ip_moptions_init(void)
2814 {
2815         PE_parse_boot_argn("ifa_debug", &imo_debug, sizeof (imo_debug));
2816
2817         imo_size = (imo_debug == 0) ? sizeof (struct ip_moptions) :
2818             sizeof (struct ip_moptions_dbg);
2819
2820         imo_zone = zinit(imo_size, IMO_ZONE_MAX * imo_size, 0,
2821             IMO_ZONE_NAME);
2822         if (imo_zone == NULL) {
2823                 panic("%s: failed allocating %s", __func__, IMO_ZONE_NAME);
2824                 /* NOTREACHED */
2825         }
2826         zone_change(imo_zone, Z_EXPAND, TRUE);
2827 }
2828
2829 void
2830 imo_addref(struct ip_moptions *imo, int locked)
2831 {
2832         if (!locked)
2833                 IMO_LOCK(imo);
2834         else
2835                 IMO_LOCK_ASSERT_HELD(imo);
2836
2837         if (++imo->imo_refcnt == 0) {
2838                 panic("%s: imo %p wraparound refcnt\n", __func__, imo);
2839                 /* NOTREACHED */
2840         } else if (imo->imo_trace != NULL) {
2841                 (*imo->imo_trace)(imo, TRUE);
2842         }
2843
2844         if (!locked)
2845                 IMO_UNLOCK(imo);
2846 }
2847
2848 void
2849 imo_remref(struct ip_moptions *imo)
2850 {
2851         int i;
2852
2853         IMO_LOCK(imo);
2854         if (imo->imo_refcnt == 0) {
2855                 panic("%s: imo %p negative refcnt", __func__, imo);
2856                 /* NOTREACHED */
2857         } else if (imo->imo_trace != NULL) {
2858                 (*imo->imo_trace)(imo, FALSE);
2859         }
2860
2861         --imo->imo_refcnt;
2862         if (imo->imo_refcnt > 0) {
2863                 IMO_UNLOCK(imo);
2864                 return;
2865         }
2866
2867         for (i = 0; i < imo->imo_num_memberships; ++i) {
2868                 struct in_mfilter *imf;
2869
2870                 imf = imo->imo_mfilters ? &imo->imo_mfilters[i] : NULL;
2871                 if (imf != NULL)
2872                         imf_leave(imf);
2873
2874                 (void) in_leavegroup(imo->imo_membership[i], imf);
2875
2876                 if (imf != NULL)
2877                         imf_purge(imf);
2878
2879                 INM_REMREF(imo->imo_membership[i]);
2880                 imo->imo_membership[i] = NULL;
2881         }
2882         imo->imo_num_memberships = 0;
2883         if (imo->imo_mfilters != NULL) {
2884                 FREE(imo->imo_mfilters, M_INMFILTER);
2885                 imo->imo_mfilters = NULL;
2886         }
2887         if (imo->imo_membership != NULL) {
2888                 FREE(imo->imo_membership, M_IPMOPTS);
2889                 imo->imo_membership = NULL;
2890         }
2891         IMO_UNLOCK(imo);
2892
2893         lck_mtx_destroy(&imo->imo_lock, ifa_mtx_grp);
2894
2895         if (!(imo->imo_debug & IFD_ALLOC)) {
2896                 panic("%s: imo %p cannot be freed", __func__, imo);
2897                 /* NOTREACHED */
2898         }
2899         zfree(imo_zone, imo);
2900 }
2901
2902 static void
2903 imo_trace(struct ip_moptions *imo, int refhold)
2904 {
2905         struct ip_moptions_dbg *imo_dbg = (struct ip_moptions_dbg *)imo;
2906         ctrace_t *tr;
2907         u_int32_t idx;
2908         u_int16_t *cnt;
2909
2910         if (!(imo->imo_debug & IFD_DEBUG)) {
2911                 panic("%s: imo %p has no debug structure", __func__, imo);
2912                 /* NOTREACHED */
2913         }
2914         if (refhold) {
2915                 cnt = &imo_dbg->imo_refhold_cnt;
2916                 tr = imo_dbg->imo_refhold;
2917         } else {
2918                 cnt = &imo_dbg->imo_refrele_cnt;
2919                 tr = imo_dbg->imo_refrele;
2920         }
2921
2922         idx = atomic_add_16_ov(cnt, 1) % IMO_TRACE_HIST_SIZE;
2923         ctrace_record(&tr[idx]);
2924 }
2925
2926 struct ip_moptions *
2927 ip_allocmoptions(int how)
2928 {
2929         struct ip_moptions *imo;
2930
2931         imo = (how == M_WAITOK) ? zalloc(imo_zone) : zalloc_noblock(imo_zone);
2932         if (imo != NULL) {
2933                 bzero(imo, imo_size);
2934                 lck_mtx_init(&imo->imo_lock, ifa_mtx_grp, ifa_mtx_attr);
2935                 imo->imo_debug |= IFD_ALLOC;
2936                 if (imo_debug != 0) {
2937                         imo->imo_debug |= IFD_DEBUG;
2938                         imo->imo_trace = imo_trace;
2939                 }
2940                 IMO_ADDREF(imo);
2941         }
2942
2943         return (imo);
2944 }
2945
2946 /*
2947  * Routine called from ip_output() to loop back a copy of an IP multicast
2948  * packet to the input queue of a specified interface.  Note that this
2949  * calls the output routine of the loopback "driver", but with an interface
2950  * pointer that might NOT be a loopback interface -- evil, but easier than
2951  * replicating that code here.
2952  */
2953 static void
2954 ip_mloopback(ifp, m, dst, hlen)
2955         struct ifnet *ifp;
2956         register struct mbuf *m;
2957         register struct sockaddr_in *dst;
2958         int hlen;
2959 {
2960         register struct ip *ip;
2961         struct mbuf *copym;
2962         int sw_csum = (apple_hwcksum_tx == 0);
2963
2964         copym = m_copy(m, 0, M_COPYALL);
2965         if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen))
2966                 copym = m_pullup(copym, hlen);
2967
2968         if (copym == NULL)
2969                 return;
2970
2971         /*
2972          * We don't bother to fragment if the IP length is greater
2973          * than the interface's MTU.  Can this possibly matter?
2974          */
2975         ip = mtod(copym, struct ip *);
2976
2977 #if BYTE_ORDER != BIG_ENDIAN
2978         HTONS(ip->ip_len);
2979         HTONS(ip->ip_off);
2980 #endif
2981
2982         ip->ip_sum = 0;
2983         ip->ip_sum = in_cksum(copym, hlen);
2984         /*
2985          * NB:
2986          * It's not clear whether there are any lingering
2987          * reentrancy problems in other areas which might
2988          * be exposed by using ip_input directly (in
2989          * particular, everything which modifies the packet
2990          * in-place).  Yet another option is using the
2991          * protosw directly to deliver the looped back
2992          * packet.  For the moment, we'll err on the side
2993          * of safety by using if_simloop().
2994          */
2995 #if 1 /* XXX */
2996         if (dst->sin_family != AF_INET) {
2997                 printf("ip_mloopback: bad address family %d\n",
2998                                         dst->sin_family);
2999                 dst->sin_family = AF_INET;
3000         }
3001 #endif
3002
3003         /*
3004          * Mark checksum as valid or calculate checksum for loopback.
3005          *
3006          * This is done this way because we have to embed the ifp of
3007          * the interface we will send the original copy of the packet
3008          * out on in the mbuf. ip_input will check if_hwassist of the
3009          * embedded ifp and ignore all csum_flags if if_hwassist is 0.
3010          * The UDP checksum has not been calculated yet.
3011          */
3012         if (sw_csum || (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA)) {
3013                 if (!sw_csum && IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist)) {
3014                         copym->m_pkthdr.csum_flags |=
3015                             CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
3016                             CSUM_IP_CHECKED | CSUM_IP_VALID;
3017                         copym->m_pkthdr.csum_data = 0xffff;
3018                 } else {
3019
3020 #if BYTE_ORDER != BIG_ENDIAN
3021                         NTOHS(ip->ip_len);
3022 #endif
3023
3024                         in_delayed_cksum(copym);
3025
3026 #if BYTE_ORDER != BIG_ENDIAN
3027                         HTONS(ip->ip_len);
3028 #endif
3029
3030                 }
3031         }
3032
3033         /*
3034          * TedW:
3035          * We need to send all loopback traffic down to dlil in case
3036          * a filter has tapped-in.
3037          */
3038
3039         /*
3040          * Stuff the 'real' ifp into the pkthdr, to be used in matching
3041          *  in ip_input(); we need the loopback ifp/dl_tag passed as args
3042          *  to make the loopback driver compliant with the data link
3043          *  requirements.
3044          */
3045         if (lo_ifp) {
3046                 copym->m_pkthdr.rcvif = ifp;
3047                 dlil_output(lo_ifp, PF_INET, copym, 0,
3048                     (struct sockaddr *) dst, 0);
3049         } else {
3050                 printf("Warning: ip_output call to dlil_find_dltag failed!\n");
3051                 m_freem(copym);
3052         }
3053 }
3054
3055 /*
3056  * Given a source IP address (and route, if available), determine the best
3057  * interface to send the packet from.  Checking for (and updating) the
3058  * ROF_SRCIF_SELECTED flag in the pcb-supplied route placeholder is done
3059  * without any locks based on the assumption that ip_output() is single-
3060  * threaded per-pcb, i.e. for any given pcb there can only be one thread
3061  * performing output at the IP layer.
3062  *
3063  * This routine is analogous to in6_selectroute() for IPv6.
3064  */
3065 static struct ifaddr *
3066 in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope)
3067 {
3068         struct ifaddr *ifa = NULL;
3069         struct in_addr src = ip->ip_src;
3070         struct in_addr dst = ip->ip_dst;
3071         struct ifnet *rt_ifp;
3072         char s_src[MAX_IPv4_STR_LEN], s_dst[MAX_IPv4_STR_LEN];
3073
3074         if (ip_select_srcif_debug) {
3075                 (void) inet_ntop(AF_INET, &src.s_addr, s_src, sizeof (s_src));
3076                 (void) inet_ntop(AF_INET, &dst.s_addr, s_dst, sizeof (s_dst));
3077         }
3078
3079         if (ro->ro_rt != NULL)
3080                 RT_LOCK(ro->ro_rt);
3081
3082         rt_ifp = (ro->ro_rt != NULL) ? ro->ro_rt->rt_ifp : NULL;
3083
3084         /*
3085          * Given the source IP address, find a suitable source interface
3086          * to use for transmission; if the caller has specified a scope,
3087          * optimize the search by looking at the addresses only for that
3088          * interface.  This is still suboptimal, however, as we need to
3089          * traverse the per-interface list.
3090          */
3091         if (ifscope != IFSCOPE_NONE || ro->ro_rt != NULL) {
3092                 unsigned int scope = ifscope;
3093
3094                 /*
3095                  * If no scope is specified and the route is stale (pointing
3096                  * to a defunct interface) use the current primary interface;
3097                  * this happens when switching between interfaces configured
3098                  * with the same IP address.  Otherwise pick up the scope
3099                  * information from the route; the ULP may have looked up a
3100                  * correct route and we just need to verify it here and mark
3101                  * it with the ROF_SRCIF_SELECTED flag below.
3102                  */
3103                 if (scope == IFSCOPE_NONE) {
3104                         scope = rt_ifp->if_index;
3105                         if (scope != get_primary_ifscope(AF_INET) &&
3106                             ro->ro_rt->generation_id != route_generation)
3107                                 scope = get_primary_ifscope(AF_INET);
3108                 }
3109
3110                 ifa = (struct ifaddr *)ifa_foraddr_scoped(src.s_addr, scope);
3111
3112                 if (ifa == NULL && ip->ip_p != IPPROTO_UDP &&
3113                     ip->ip_p != IPPROTO_TCP && ipforwarding) {
3114                         /*
3115                          * If forwarding is enabled, and if the packet isn't
3116                          * TCP or UDP, check if the source address belongs
3117                          * to one of our own interfaces; if so, demote the
3118                          * interface scope and do a route lookup right below.
3119                          */
3120                         ifa = (struct ifaddr *)ifa_foraddr(src.s_addr);
3121                         if (ifa != NULL) {
3122                                 IFA_REMREF(ifa);
3123                                 ifa = NULL;
3124                                 ifscope = IFSCOPE_NONE;
3125                         }
3126                 }
3127
3128                 if (ip_select_srcif_debug && ifa != NULL) {
3129                         if (ro->ro_rt != NULL) {
3130                                 printf("%s->%s ifscope %d->%d ifa_if %s "
3131                                     "ro_if %s\n", s_src, s_dst, ifscope,
3132                                     scope, if_name(ifa->ifa_ifp),
3133                                     if_name(rt_ifp));
3134                         } else {
3135                                 printf("%s->%s ifscope %d->%d ifa_if %s\n",
3136                                     s_src, s_dst, ifscope, scope,
3137                                     if_name(ifa->ifa_ifp));
3138                         }
3139                 }
3140         }
3141
3142         /*
3143          * Slow path; search for an interface having the corresponding source
3144          * IP address if the scope was not specified by the caller, and:
3145          *
3146          *   1) There currently isn't any route, or,
3147          *   2) The interface used by the route does not own that source
3148          *      IP address; in this case, the route will get blown away
3149          *      and we'll do a more specific scoped search using the newly
3150          *      found interface.
3151          */
3152         if (ifa == NULL && ifscope == IFSCOPE_NONE) {
3153                 ifa = (struct ifaddr *)ifa_foraddr(src.s_addr);
3154
3155                 /*
3156                  * If we have the IP address, but not the route, we don't
3157                  * really know whether or not it belongs to the correct
3158                  * interface (it could be shared across multiple interfaces.)
3159                  * The only way to find out is to do a route lookup.
3160                  */
3161                 if (ifa != NULL && ro->ro_rt == NULL) {
3162                         struct rtentry *rt;
3163                         struct sockaddr_in sin;
3164                         struct ifaddr *oifa = NULL;
3165
3166                         bzero(&sin, sizeof (sin));
3167                         sin.sin_family = AF_INET;
3168                         sin.sin_len = sizeof (sin);
3169                         sin.sin_addr = dst;
3170
3171                         lck_mtx_lock(rnh_lock);
3172                         if ((rt = rt_lookup(TRUE, (struct sockaddr *)&sin, NULL,
3173                             rt_tables[AF_INET], IFSCOPE_NONE)) != NULL) {
3174                                 RT_LOCK(rt);
3175                                 /*
3176                                  * If the route uses a different interface,
3177                                  * use that one instead.  The IP address of
3178                                  * the ifaddr that we pick up here is not
3179                                  * relevant.
3180                                  */
3181                                 if (ifa->ifa_ifp != rt->rt_ifp) {
3182                                         oifa = ifa;
3183                                         ifa = rt->rt_ifa;
3184                                         IFA_ADDREF(ifa);
3185                                         RT_UNLOCK(rt);
3186                                 } else {
3187                                         RT_UNLOCK(rt);
3188                                 }
3189                                 rtfree_locked(rt);
3190                         }
3191                         lck_mtx_unlock(rnh_lock);
3192
3193                         if (oifa != NULL) {
3194                                 struct ifaddr *iifa;
3195
3196                                 /*
3197                                  * See if the interface pointed to by the
3198                                  * route is configured with the source IP
3199                                  * address of the packet.
3200                                  */
3201                                 iifa = (struct ifaddr *)ifa_foraddr_scoped(
3202                                     src.s_addr, ifa->ifa_ifp->if_index);
3203
3204                                 if (iifa != NULL) {
3205                                         /*
3206                                          * Found it; drop the original one
3207                                          * as well as the route interface
3208                                          * address, and use this instead.
3209                                          */
3210                                         IFA_REMREF(oifa);
3211                                         IFA_REMREF(ifa);
3212                                         ifa = iifa;
3213                                 } else if (!ipforwarding ||
3214                                     (rt->rt_flags & RTF_GATEWAY)) {
3215                                         /*
3216                                          * This interface doesn't have that
3217                                          * source IP address; drop the route
3218                                          * interface address and just use the
3219                                          * original one, and let the caller
3220                                          * do a scoped route lookup.
3221                                          */
3222                                         IFA_REMREF(ifa);
3223                                         ifa = oifa;
3224                                 } else {
3225                                         /*
3226                                          * Forwarding is enabled and the source
3227                                          * address belongs to one of our own
3228                                          * interfaces which isn't the outgoing
3229                                          * interface, and we have a route, and
3230                                          * the destination is on a network that
3231                                          * is directly attached (onlink); drop
3232                                          * the original one and use the route
3233                                          * interface address instead.
3234                                          */
3235                                         IFA_REMREF(oifa);
3236                                 }
3237                         }
3238                 } else if (ifa != NULL && ro->ro_rt != NULL &&
3239                     !(ro->ro_rt->rt_flags & RTF_GATEWAY) &&
3240                     ifa->ifa_ifp != ro->ro_rt->rt_ifp && ipforwarding) {
3241                         /*
3242                          * Forwarding is enabled and the source address belongs
3243                          * to one of our own interfaces which isn't the same
3244                          * as the interface used by the known route; drop the
3245                          * original one and use the route interface address.
3246                          */
3247                         IFA_REMREF(ifa);
3248                         ifa = ro->ro_rt->rt_ifa;
3249                         IFA_ADDREF(ifa);
3250                 }
3251
3252                 if (ip_select_srcif_debug && ifa != NULL) {
3253                         printf("%s->%s ifscope %d ifa_if %s\n",
3254                             s_src, s_dst, ifscope, if_name(ifa->ifa_ifp));
3255                 }
3256         }
3257
3258         if (ro->ro_rt != NULL)
3259                 RT_LOCK_ASSERT_HELD(ro->ro_rt);
3260         /*
3261          * If there is a non-loopback route with the wrong interface, or if
3262          * there is no interface configured with such an address, blow it
3263          * away.  Except for local/loopback, we look for one with a matching
3264          * interface scope/index.
3265          */
3266         if (ro->ro_rt != NULL &&
3267             (ifa == NULL || (ifa->ifa_ifp != rt_ifp && rt_ifp != lo_ifp) ||
3268             !(ro->ro_rt->rt_flags & RTF_UP))) {
3269                 if (ip_select_srcif_debug) {
3270                         if (ifa != NULL) {
3271                                 printf("%s->%s ifscope %d ro_if %s != "
3272                                     "ifa_if %s (cached route cleared)\n",
3273                                     s_src, s_dst, ifscope, if_name(rt_ifp),
3274                                     if_name(ifa->ifa_ifp));
3275                         } else {
3276                                 printf("%s->%s ifscope %d ro_if %s "
3277                                     "(no ifa_if found)\n",
3278                                     s_src, s_dst, ifscope, if_name(rt_ifp));
3279                         }
3280                 }
3281
3282                 RT_UNLOCK(ro->ro_rt);
3283                 rtfree(ro->ro_rt);
3284                 ro->ro_rt = NULL;
3285                 ro->ro_flags &= ~ROF_SRCIF_SELECTED;
3286
3287                 /*
3288                  * If the destination is IPv4 LLA and the route's interface
3289                  * doesn't match the source interface, then the source IP
3290                  * address is wrong; it most likely belongs to the primary
3291                  * interface associated with the IPv4 LL subnet.  Drop the
3292                  * packet rather than letting it go out and return an error
3293                  * to the ULP.  This actually applies not only to IPv4 LL
3294                  * but other shared subnets; for now we explicitly test only
3295                  * for the former case and save the latter for future.
3296                  */
3297                 if (IN_LINKLOCAL(ntohl(dst.s_addr)) &&
3298                     !IN_LINKLOCAL(ntohl(src.s_addr)) && ifa != NULL) {
3299                         IFA_REMREF(ifa);
3300                         ifa = NULL;
3301                 }
3302         }
3303
3304         if (ip_select_srcif_debug && ifa == NULL) {
3305                 printf("%s->%s ifscope %d (neither ro_if/ifa_if found)\n",
3306                     s_src, s_dst, ifscope);
3307         }
3308
3309         /*
3310          * If there is a route, mark it accordingly.  If there isn't one,
3311          * we'll get here again during the next transmit (possibly with a
3312          * route) and the flag will get set at that point.  For IPv4 LLA
3313          * destination, mark it only if the route has been fully resolved;
3314          * otherwise we want to come back here again when the route points
3315          * to the interface over which the ARP reply arrives on.
3316          */
3317         if (ro->ro_rt != NULL && (!IN_LINKLOCAL(ntohl(dst.s_addr)) ||
3318             (ro->ro_rt->rt_gateway->sa_family == AF_LINK &&
3319             SDL(ro->ro_rt->rt_gateway)->sdl_alen != 0))) {
3320                 ro->ro_flags |= ROF_SRCIF_SELECTED;
3321                 ro->ro_rt->generation_id = route_generation;
3322         }
3323
3324         if (ro->ro_rt != NULL)
3325                 RT_UNLOCK(ro->ro_rt);
3326
3327         return (ifa);
3328 }