1 /*
2 * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1982, 1986, 1988, 1990, 1993
30 * The Regents of the University of California. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94
61 */
62 /*
63 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
64 * support for mandatory and extensible security protections. This notice
65 * is included in support of clause 2.2 (b) of the Apple Public License,
66 * Version 2.0.
67 */
68
69 #define _IP_VHL
70
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/kernel.h>
74 #include <sys/malloc.h>
75 #include <sys/mbuf.h>
76 #include <sys/protosw.h>
77 #include <sys/socket.h>
78 #include <sys/socketvar.h>
79 #include <kern/locks.h>
80 #include <sys/sysctl.h>
81 #include <sys/mcache.h>
82 #include <sys/kdebug.h>
83
84 #include <machine/endian.h>
85 #include <pexpert/pexpert.h>
86 #include <mach/sdt.h>
87
88 #include <libkern/OSAtomic.h>
89 #include <libkern/OSByteOrder.h>
90
91 #include <net/if.h>
92 #include <net/if_dl.h>
93 #include <net/if_types.h>
94 #include <net/route.h>
95 #include <net/ntstat.h>
96 #include <net/net_osdep.h>
97 #include <net/dlil.h>
98
99 #include <netinet/in.h>
100 #include <netinet/in_systm.h>
101 #include <netinet/ip.h>
102 #include <netinet/in_pcb.h>
103 #include <netinet/in_var.h>
104 #include <netinet/ip_var.h>
105 #include <netinet/kpi_ipfilter_var.h>
106
107 #if CONFIG_MACF_NET
108 #include <security/mac_framework.h>
109 #endif /* CONFIG_MACF_NET */
110
111 #define DBG_LAYER_BEG NETDBG_CODE(DBG_NETIP, 1)
112 #define DBG_LAYER_END NETDBG_CODE(DBG_NETIP, 3)
113 #define DBG_FNC_IP_OUTPUT NETDBG_CODE(DBG_NETIP, (1 << 8) | 1)
114 #define DBG_FNC_IPSEC4_OUTPUT NETDBG_CODE(DBG_NETIP, (2 << 8) | 1)
115
116 #if IPSEC
117 #include <netinet6/ipsec.h>
118 #include <netkey/key.h>
119 #if IPSEC_DEBUG
120 #include <netkey/key_debug.h>
121 #else
122 #define KEYDEBUG(lev, arg)
123 #endif
124 #endif /* IPSEC */
125
126 #if IPFIREWALL
127 #include <netinet/ip_fw.h>
128 #if IPDIVERT
129 #include <netinet/ip_divert.h>
130 #endif /* IPDIVERT */
131 #endif /* IPFIREWALL */
132
133 #if DUMMYNET
134 #include <netinet/ip_dummynet.h>
135 #endif
136
137 #if PF
138 #include <net/pfvar.h>
139 #endif /* PF */
140
141 #if IPFIREWALL_FORWARD && IPFIREWALL_FORWARD_DEBUG
142 #define print_ip(a) \
143 printf("%ld.%ld.%ld.%ld", (ntohl(a.s_addr) >> 24) & 0xFF, \
144 (ntohl(a.s_addr) >> 16) & 0xFF, \
145 (ntohl(a.s_addr) >> 8) & 0xFF, \
146 (ntohl(a.s_addr)) & 0xFF);
147 #endif /* IPFIREWALL_FORWARD && IPFIREWALL_FORWARD_DEBUG */
148
149 u_short ip_id;
150
151 static void ip_out_cksum_stats(int, u_int32_t);
152 static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);
153 static int ip_optcopy(struct ip *, struct ip *);
154 static int ip_pcbopts(int, struct mbuf **, struct mbuf *);
155 static void imo_trace(struct ip_moptions *, int);
156 static void ip_mloopback(struct ifnet *, struct ifnet *, struct mbuf *,
157 struct sockaddr_in *, int);
158 static struct ifaddr *in_selectsrcif(struct ip *, struct route *, unsigned int);
159
160 extern struct ip_linklocal_stat ip_linklocal_stat;
161
162 /* temporary: for testing */
163 #if IPSEC
164 extern int ipsec_bypass;
165 #endif
166
167 static int ip_maxchainsent = 0;
168 SYSCTL_INT(_net_inet_ip, OID_AUTO, maxchainsent,
169 CTLFLAG_RW | CTLFLAG_LOCKED, &ip_maxchainsent, 0,
170 "use dlil_output_list");
171 #if DEBUG
172 static int forge_ce = 0;
173 SYSCTL_INT(_net_inet_ip, OID_AUTO, forge_ce,
174 CTLFLAG_RW | CTLFLAG_LOCKED, &forge_ce, 0,
175 "Forge ECN CE");
176 #endif /* DEBUG */
177
178 static int ip_select_srcif_debug = 0;
179 SYSCTL_INT(_net_inet_ip, OID_AUTO, select_srcif_debug,
180 CTLFLAG_RW | CTLFLAG_LOCKED, &ip_select_srcif_debug, 0,
181 "log source interface selection debug info");
182
183 #define IMO_TRACE_HIST_SIZE 32 /* size of trace history */
184
185 /* For gdb */
186 __private_extern__ unsigned int imo_trace_hist_size = IMO_TRACE_HIST_SIZE;
187
188 struct ip_moptions_dbg {
189 struct ip_moptions imo; /* ip_moptions */
190 u_int16_t imo_refhold_cnt; /* # of IMO_ADDREF */
191 u_int16_t imo_refrele_cnt; /* # of IMO_REMREF */
192 /*
193 * Alloc and free callers.
194 */
195 ctrace_t imo_alloc;
196 ctrace_t imo_free;
197 /*
198 * Circular lists of IMO_ADDREF and IMO_REMREF callers.
199 */
200 ctrace_t imo_refhold[IMO_TRACE_HIST_SIZE];
201 ctrace_t imo_refrele[IMO_TRACE_HIST_SIZE];
202 };
203
204 #if DEBUG
205 static unsigned int imo_debug = 1; /* debugging (enabled) */
206 #else
207 static unsigned int imo_debug; /* debugging (disabled) */
208 #endif /* !DEBUG */
209 static unsigned int imo_size; /* size of zone element */
210 static struct zone *imo_zone; /* zone for ip_moptions */
211
212 #define IMO_ZONE_MAX 64 /* maximum elements in zone */
213 #define IMO_ZONE_NAME "ip_moptions" /* zone name */
214
215 /*
216 * IP output. The packet in mbuf chain m contains a skeletal IP
217 * header (with len, off, ttl, proto, tos, src, dst).
218 * The mbuf chain containing the packet will be freed.
219 * The mbuf opt, if present, will not be freed.
220 */
221 int
222 ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro, int flags,
223 struct ip_moptions *imo, struct ip_out_args *ipoa)
224 {
225 return (ip_output_list(m0, 0, opt, ro, flags, imo, ipoa));
226 }
227
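/*
 * Illustrative caller sketch (not part of the original source; the helper
 * name is hypothetical).  A sender hands ip_output() an mbuf whose data
 * already begins with a skeletal struct ip -- proto, tos, src and dst are
 * assumed to have been filled in by the caller -- along with a caller-owned
 * route structure.
 */
#if 0
static int
example_ip_send(struct mbuf *m)
{
	struct route ro;
	struct ip *ip = mtod(m, struct ip *);
	int error;

	/* finish the skeletal header; lengths are in host byte order here */
	ip->ip_len = m->m_pkthdr.len;
	ip->ip_off = 0;
	ip->ip_ttl = MAXTTL;

	/* ro MUST be non-NULL; ip_output() may cache a route in it */
	bzero(&ro, sizeof (ro));
	error = ip_output(m, NULL, &ro, 0, NULL, NULL);
	ROUTE_RELEASE(&ro);
	return (error);
}
#endif
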
228 /*
229 * IP output. The packet in mbuf chain m contains a skeletal IP
230 * header (with len, off, ttl, proto, tos, src, dst).
231 * The mbuf chain containing the packet will be freed.
232 * The mbuf opt, if present, will not be freed.
233 *
234 * Route ro MUST be non-NULL; if ro->ro_rt is valid, the route lookup is
235 * skipped and ro->ro_rt is used. Otherwise the result of the route
236 * lookup is stored in ro->ro_rt.
237 *
238 * In the IP forwarding case, the packet will arrive with options already
239 * inserted, so the opt pointer must be NULL.
240 */
241 int
242 ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt,
243 struct route *ro, int flags, struct ip_moptions *imo,
244 struct ip_out_args *ipoa)
245 {
246 struct ip *ip;
247 struct ifnet *ifp = NULL; /* not refcnt'd */
248 struct mbuf *m = m0, *prevnxt = NULL, **mppn = &prevnxt;
249 int hlen = sizeof (struct ip);
250 int len = 0, error = 0;
251 struct sockaddr_in *dst = NULL;
252 struct in_ifaddr *ia = NULL, *src_ia = NULL;
253 struct in_addr pkt_dst;
254 struct ipf_pktopts *ippo = NULL;
255 ipfilter_t inject_filter_ref = NULL;
256 struct mbuf *packetlist;
257 uint32_t sw_csum, pktcnt = 0, scnt = 0, bytecnt = 0;
258 unsigned int ifscope = IFSCOPE_NONE;
259 struct flowadv *adv = NULL;
260 #if IPSEC
261 struct socket *so = NULL;
262 struct secpolicy *sp = NULL;
263 #endif /* IPSEC */
264 #if IPFIREWALL
265 int ipfwoff;
266 struct sockaddr_in *next_hop_from_ipfwd_tag = NULL;
267 #endif /* IPFIREWALL */
268 #if IPFIREWALL || DUMMYNET
269 struct m_tag *tag;
270 #endif /* IPFIREWALL || DUMMYNET */
271 #if DUMMYNET
272 struct ip_out_args saved_ipoa;
273 struct sockaddr_in dst_buf;
274 #endif /* DUMMYNET */
275 struct {
276 #if IPSEC
277 struct ipsec_output_state ipsec_state;
278 #endif /* IPSEC */
279 #if IPFIREWALL || DUMMYNET
280 struct ip_fw_args args;
281 #endif /* IPFIREWALL || DUMMYNET */
282 #if IPFIREWALL_FORWARD
283 struct route sro_fwd;
284 #endif /* IPFIREWALL_FORWARD */
285 #if DUMMYNET
286 struct route saved_route;
287 #endif /* DUMMYNET */
288 struct ipf_pktopts ipf_pktopts;
289 } ipobz;
290 #define ipsec_state ipobz.ipsec_state
291 #define args ipobz.args
292 #define sro_fwd ipobz.sro_fwd
293 #define saved_route ipobz.saved_route
294 #define ipf_pktopts ipobz.ipf_pktopts
295 union {
296 struct {
297 boolean_t select_srcif : 1; /* set once */
298 boolean_t srcbound : 1; /* set once */
299 boolean_t nocell : 1; /* set once */
300 boolean_t isbroadcast : 1;
301 boolean_t didfilter : 1;
302 #if IPFIREWALL_FORWARD
303 boolean_t fwd_rewrite_src : 1;
304 #endif /* IPFIREWALL_FORWARD */
305 };
306 uint32_t raw;
307 } ipobf = { .raw = 0 };
308
309 KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
310
311 VERIFY(m0->m_flags & M_PKTHDR);
312 packetlist = m0;
313
314 /* zero out {ipsec_state, args, sro_fwd, saved_route, ipf_pktopts} */
315 bzero(&ipobz, sizeof (ipobz));
316 ippo = &ipf_pktopts;
317
318 #if IPFIREWALL || DUMMYNET
319 if (SLIST_EMPTY(&m0->m_pkthdr.tags))
320 goto ipfw_tags_done;
321
322 /* Grab info from mtags prepended to the chain */
323 #if DUMMYNET
324 if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID,
325 KERNEL_TAG_TYPE_DUMMYNET, NULL)) != NULL) {
326 struct dn_pkt_tag *dn_tag;
327
328 dn_tag = (struct dn_pkt_tag *)(tag+1);
329 args.fwa_ipfw_rule = dn_tag->dn_ipfw_rule;
330 args.fwa_pf_rule = dn_tag->dn_pf_rule;
331 opt = NULL;
332 saved_route = dn_tag->dn_ro;
333 ro = &saved_route;
334
335 imo = NULL;
336 bcopy(&dn_tag->dn_dst, &dst_buf, sizeof (dst_buf));
337 dst = &dst_buf;
338 ifp = dn_tag->dn_ifp;
339 flags = dn_tag->dn_flags;
340 if ((dn_tag->dn_flags & IP_OUTARGS)) {
341 saved_ipoa = dn_tag->dn_ipoa;
342 ipoa = &saved_ipoa;
343 }
344
345 m_tag_delete(m0, tag);
346 }
347 #endif /* DUMMYNET */
348
349 #if IPDIVERT
350 if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID,
351 KERNEL_TAG_TYPE_DIVERT, NULL)) != NULL) {
352 struct divert_tag *div_tag;
353
354 div_tag = (struct divert_tag *)(tag+1);
355 args.fwa_divert_rule = div_tag->cookie;
356
357 m_tag_delete(m0, tag);
358 }
359 #endif /* IPDIVERT */
360
361 #if IPFIREWALL
362 if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID,
363 KERNEL_TAG_TYPE_IPFORWARD, NULL)) != NULL) {
364 struct ip_fwd_tag *ipfwd_tag;
365
366 ipfwd_tag = (struct ip_fwd_tag *)(tag+1);
367 next_hop_from_ipfwd_tag = ipfwd_tag->next_hop;
368
369 m_tag_delete(m0, tag);
370 }
371 #endif /* IPFIREWALL */
372
373 ipfw_tags_done:
374 #endif /* IPFIREWALL || DUMMYNET */
375
376 m = m0;
377 m->m_pkthdr.pkt_flags &= ~(PKTF_LOOP|PKTF_IFAINFO);
378
379 #if IPSEC
380 if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) {
381 /* If packet is bound to an interface, check bound policies */
382 if ((flags & IP_OUTARGS) && (ipoa != NULL) &&
383 (ipoa->ipoa_flags & IPOAF_BOUND_IF) &&
384 ipoa->ipoa_boundif != IFSCOPE_NONE) {
385 if (ipsec4_getpolicybyinterface(m, IPSEC_DIR_OUTBOUND,
386 &flags, ipoa, &sp) != 0)
387 goto bad;
388 }
389 }
390 #endif /* IPSEC */
391
392 VERIFY(ro != NULL);
393
394 if (ip_doscopedroute && (flags & IP_OUTARGS)) {
395 /*
396 * In the forwarding case, only the ifscope value is used,
397 * as source interface selection doesn't take place.
398 */
399 if ((ipobf.select_srcif = (!(flags & IP_FORWARDING) &&
400 (ipoa->ipoa_flags & IPOAF_SELECT_SRCIF)))) {
401 ipf_pktopts.ippo_flags |= IPPOF_SELECT_SRCIF;
402 }
403
404 if ((ipoa->ipoa_flags & IPOAF_BOUND_IF) &&
405 ipoa->ipoa_boundif != IFSCOPE_NONE) {
406 ifscope = ipoa->ipoa_boundif;
407 ipf_pktopts.ippo_flags |=
408 (IPPOF_BOUND_IF | (ifscope << IPPOF_SHIFT_IFSCOPE));
409 }
410
411 /* double negation needed for bool bit field */
412 ipobf.srcbound = !!(ipoa->ipoa_flags & IPOAF_BOUND_SRCADDR);
413 if (ipobf.srcbound)
414 ipf_pktopts.ippo_flags |= IPPOF_BOUND_SRCADDR;
415 } else {
416 ipobf.select_srcif = FALSE;
417 ipobf.srcbound = FALSE;
418 ifscope = IFSCOPE_NONE;
419 if (flags & IP_OUTARGS) {
420 ipoa->ipoa_boundif = IFSCOPE_NONE;
421 ipoa->ipoa_flags &= ~(IPOAF_SELECT_SRCIF |
422 IPOAF_BOUND_IF | IPOAF_BOUND_SRCADDR);
423 }
424 }
425
426 if ((flags & IP_OUTARGS) && (ipoa->ipoa_flags & IPOAF_NO_CELLULAR)) {
427 ipobf.nocell = TRUE;
428 ipf_pktopts.ippo_flags |= IPPOF_NO_IFT_CELLULAR;
429 }
430
431 if (flags & IP_OUTARGS) {
432 adv = &ipoa->ipoa_flowadv;
433 adv->code = FADV_SUCCESS;
434 ipoa->ipoa_retflags = 0;
435 }
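
/*
 * Illustrative note (not in the original source): a caller requesting scoped
 * output typically zeroes a struct ip_out_args, sets ipoa_boundif to the
 * interface index, ORs IPOAF_BOUND_IF (and optionally IPOAF_SELECT_SRCIF or
 * IPOAF_NO_CELLULAR) into ipoa_flags, and passes IP_OUTARGS in flags; the
 * flow advisory code and ipoa_retflags initialized above are then reported
 * back to that caller.
 */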
436
437 #if DUMMYNET
438 if (args.fwa_ipfw_rule != NULL || args.fwa_pf_rule != NULL) {
439 /* dummynet already saw us */
440 ip = mtod(m, struct ip *);
441 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
442 pkt_dst = ip->ip_dst;
443 if (ro->ro_rt != NULL) {
444 RT_LOCK_SPIN(ro->ro_rt);
445 ia = (struct in_ifaddr *)ro->ro_rt->rt_ifa;
446 if (ia) {
447 /* Become a regular mutex */
448 RT_CONVERT_LOCK(ro->ro_rt);
449 IFA_ADDREF(&ia->ia_ifa);
450 }
451 RT_UNLOCK(ro->ro_rt);
452 }
453 #if IPSEC
454 if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) {
455 so = ipsec_getsocket(m);
456 (void) ipsec_setsocket(m, NULL);
457 }
458 #endif /* IPSEC */
459 #if IPFIREWALL
460 if (args.fwa_ipfw_rule != NULL)
461 goto skip_ipsec;
462 #endif /* IPFIREWALL */
463 if (args.fwa_pf_rule != NULL)
464 goto sendit;
465 }
466 #endif /* DUMMYNET */
467
468 #if IPSEC
469 if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) {
470 so = ipsec_getsocket(m);
471 (void) ipsec_setsocket(m, NULL);
472 }
473 #endif /* IPSEC */
474
475 loopit:
476 ipobf.isbroadcast = FALSE;
477 ipobf.didfilter = FALSE;
478 #if IPFIREWALL_FORWARD
479 ipobf.fwd_rewrite_src = FALSE;
480 #endif /* IPFIREWALL_FORWARD */
481
482 VERIFY(m->m_flags & M_PKTHDR);
483 /*
484 * No need to process the packet twice if we've already seen it.
485 */
486 if (!SLIST_EMPTY(&m->m_pkthdr.tags))
487 inject_filter_ref = ipf_get_inject_filter(m);
488 else
489 inject_filter_ref = NULL;
490
491 if (opt) {
492 m = ip_insertoptions(m, opt, &len);
493 hlen = len;
494 /* Update the chain */
495 if (m != m0) {
496 if (m0 == packetlist)
497 packetlist = m;
498 m0 = m;
499 }
500 }
501 ip = mtod(m, struct ip *);
502
503 #if IPFIREWALL
504 /*
505 * rdar://8542331
506 *
507 * When dealing with a packet chain, we need to reset "next_hop"
508 * because "dst" may have been changed to the gateway address below
509 * for the previous packet of the chain. This could cause the route
510 * to be inadvertently changed to the route to the gateway address
511 * (instead of the route to the destination).
512 */
513 args.fwa_next_hop = next_hop_from_ipfwd_tag;
514 pkt_dst = args.fwa_next_hop ? args.fwa_next_hop->sin_addr : ip->ip_dst;
515 #else /* !IPFIREWALL */
516 pkt_dst = ip->ip_dst;
517 #endif /* !IPFIREWALL */
518
519 /*
520 * We must not send if the packet is destined to network zero.
521 * RFC1122 3.2.1.3 (a) and (b).
522 */
523 if (IN_ZERONET(ntohl(pkt_dst.s_addr))) {
524 error = EHOSTUNREACH;
525 goto bad;
526 }
527
528 /*
529 * Fill in IP header.
530 */
531 if (!(flags & (IP_FORWARDING|IP_RAWOUTPUT))) {
532 ip->ip_vhl = IP_MAKE_VHL(IPVERSION, hlen >> 2);
533 ip->ip_off &= IP_DF;
534 ip->ip_id = ip_randomid();
535 OSAddAtomic(1, &ipstat.ips_localout);
536 } else {
537 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
538 }
539
540 #if DEBUG
541 /* For debugging, we let the stack forge congestion */
542 if (forge_ce != 0 &&
543 ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT1 ||
544 (ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT0)) {
545 ip->ip_tos = (ip->ip_tos & ~IPTOS_ECN_MASK) | IPTOS_ECN_CE;
546 forge_ce--;
547 }
548 #endif /* DEBUG */
549
550 KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr, ip->ip_src.s_addr,
551 ip->ip_p, ip->ip_off, ip->ip_len);
552
553 dst = SIN(&ro->ro_dst);
554
555 /*
556 * If there is a cached route,
557 * check that it is to the same destination
558 * and is still up. If not, free it and try again.
559 * The address family should also be checked in case of sharing the
560 * cache with IPv6.
561 */
562
563 if (ro->ro_rt != NULL) {
564 if (ROUTE_UNUSABLE(ro) && ip->ip_src.s_addr != INADDR_ANY &&
565 !(flags & (IP_ROUTETOIF | IP_FORWARDING))) {
566 src_ia = ifa_foraddr(ip->ip_src.s_addr);
567 if (src_ia == NULL) {
568 error = EADDRNOTAVAIL;
569 goto bad;
570 }
571 IFA_REMREF(&src_ia->ia_ifa);
572 src_ia = NULL;
573 }
574 /*
575 * Test rt_flags without holding rt_lock for performance
576 * reasons; if the route is down it will hopefully be
577 * caught by the layer below (since it uses this route
578 * as a hint) or during the next transmit.
579 */
580 if (ROUTE_UNUSABLE(ro) || dst->sin_family != AF_INET ||
581 dst->sin_addr.s_addr != pkt_dst.s_addr)
582 ROUTE_RELEASE(ro);
583
584 /*
585 * If we're doing source interface selection, we may not
586 * want to use this route; only synch up the generation
587 * count otherwise.
588 */
589 if (!ipobf.select_srcif && ro->ro_rt != NULL &&
590 RT_GENID_OUTOFSYNC(ro->ro_rt))
591 RT_GENID_SYNC(ro->ro_rt);
592 }
593 if (ro->ro_rt == NULL) {
594 bzero(dst, sizeof (*dst));
595 dst->sin_family = AF_INET;
596 dst->sin_len = sizeof (*dst);
597 dst->sin_addr = pkt_dst;
598 }
599 /*
600 * If routing to interface only,
601 * short circuit routing lookup.
602 */
603 if (flags & IP_ROUTETOIF) {
604 if (ia != NULL)
605 IFA_REMREF(&ia->ia_ifa);
606 if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) {
607 ia = ifatoia(ifa_ifwithnet(sintosa(dst)));
608 if (ia == NULL) {
609 OSAddAtomic(1, &ipstat.ips_noroute);
610 error = ENETUNREACH;
611 goto bad;
612 }
613 }
614 ifp = ia->ia_ifp;
615 ip->ip_ttl = 1;
616 ipobf.isbroadcast = in_broadcast(dst->sin_addr, ifp);
617 /*
618 * For consistency with other cases below. Loopback
619 * multicast case is handled separately by ip_mloopback().
620 */
621 if ((ifp->if_flags & IFF_LOOPBACK) &&
622 !IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
623 m->m_pkthdr.rcvif = ifp;
624 ip_setsrcifaddr_info(m, ifp->if_index, NULL);
625 ip_setdstifaddr_info(m, ifp->if_index, NULL);
626 }
627 } else if (IN_MULTICAST(ntohl(pkt_dst.s_addr)) &&
628 imo != NULL && (ifp = imo->imo_multicast_ifp) != NULL) {
629 /*
630 * Bypass the normal routing lookup for multicast
631 * packets if the interface is specified.
632 */
633 ipobf.isbroadcast = FALSE;
634 if (ia != NULL)
635 IFA_REMREF(&ia->ia_ifa);
636
637 /* Macro takes reference on ia */
638 IFP_TO_IA(ifp, ia);
639 } else {
640 struct ifaddr *ia0 = NULL;
641 boolean_t cloneok = FALSE;
642 /*
643 * Perform source interface selection; the source IP address
644 * must belong to one of the addresses of the interface used
645 * by the route. For performance reasons, do this only if
646 * there is no route, or if the routing table has changed,
647 * or if we haven't done source interface selection on this
648 * route (for this PCB instance) before.
649 */
650 if (ipobf.select_srcif &&
651 ip->ip_src.s_addr != INADDR_ANY && (ROUTE_UNUSABLE(ro) ||
652 !(ro->ro_flags & ROF_SRCIF_SELECTED))) {
653 /* Find the source interface */
654 ia0 = in_selectsrcif(ip, ro, ifscope);
655
656 /*
657 * If the source address belongs to a cellular interface
658 * and the caller forbids our using interfaces of such
659 * type, pretend that there is no route.
660 */
661 if (ipobf.nocell && ia0 != NULL &&
662 IFNET_IS_CELLULAR(ia0->ifa_ifp)) {
663 IFA_REMREF(ia0);
664 ia0 = NULL;
665 error = EHOSTUNREACH;
666 if (flags & IP_OUTARGS)
667 ipoa->ipoa_retflags |= IPOARF_IFDENIED;
668 goto bad;
669 }
670
671 /*
672 * If the source address is spoofed (in the case of
672 * IP_RAWOUTPUT on an unbound socket), or if this
674 * is destined for local/loopback, just let it go out
675 * using the interface of the route. Otherwise,
676 * there's no interface having such an address,
677 * so bail out.
678 */
679 if (ia0 == NULL && (!(flags & IP_RAWOUTPUT) ||
680 ipobf.srcbound) && ifscope != lo_ifp->if_index) {
681 error = EADDRNOTAVAIL;
682 goto bad;
683 }
684
685 /*
686 * If the caller didn't explicitly specify the scope,
687 * pick it up from the source interface. If the cached
688 * route was wrong and was blown away as part of source
689 * interface selection, don't mask out RTF_PRCLONING
690 * since that route may have been allocated by the ULP,
691 * unless the IP header was created by the caller or
692 * the destination is IPv4 LLA. The check for the
693 * latter is needed because IPv4 LLAs are never scoped
694 * in the current implementation, and we don't want to
695 * replace the resolved IPv4 LLA route with one whose
696 * gateway points to that of the default gateway on
697 * the primary interface of the system.
698 */
699 if (ia0 != NULL) {
700 if (ifscope == IFSCOPE_NONE)
701 ifscope = ia0->ifa_ifp->if_index;
702 cloneok = (!(flags & IP_RAWOUTPUT) &&
703 !(IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))));
704 }
705 }
706
707 /*
708 * If this is the case, we probably don't want to allocate
709 * a protocol-cloned route since we didn't get one from the
710 * ULP. This lets TCP do its thing, while not burdening
711 * forwarding or ICMP with the overhead of cloning a route.
712 * Of course, we still want to do any cloning requested by
713 * the link layer, as this is probably required in all cases
714 * for correct operation (as it is for ARP).
715 */
716 if (ro->ro_rt == NULL) {
717 unsigned long ign = RTF_PRCLONING;
718 /*
719 * We make an exception here: if the destination
720 * address is INADDR_BROADCAST, allocate a protocol-
721 * cloned host route so that we end up with a route
722 * marked with the RTF_BROADCAST flag. Otherwise,
723 * we would end up referring to the default route,
724 * instead of creating a cloned host route entry.
725 * That would introduce inconsistencies between ULPs
726 * that allocate a route and those that don't. The
727 * RTF_BROADCAST route is important since we'd want
728 * to send out undirected IP broadcast packets using
729 * link-level broadcast address. Another exception
730 * is for ULP-created routes that got blown away by
731 * source interface selection (see above).
732 *
733 * These exceptions will no longer be necessary when
734 * the RTF_PRCLONING scheme is no longer present.
735 */
736 if (cloneok || dst->sin_addr.s_addr == INADDR_BROADCAST)
737 ign &= ~RTF_PRCLONING;
738
739 /*
740 * Loosen the route lookup criteria if the ifscope
741 * corresponds to the loopback interface; this is
742 * needed to support Application Layer Gateways
743 * listening on loopback, in conjunction with packet
744 * filter redirection rules. The final source IP
745 * address will be rewritten by the packet filter
746 * prior to the RFC1122 loopback check below.
747 */
748 if (ifscope == lo_ifp->if_index)
749 rtalloc_ign(ro, ign);
750 else
751 rtalloc_scoped_ign(ro, ign, ifscope);
752
753 /*
754 * If the route points to a cellular interface and the
755 * caller forbids our using interfaces of such type,
756 * pretend that there is no route.
757 */
758 if (ipobf.nocell && ro->ro_rt != NULL) {
759 RT_LOCK_SPIN(ro->ro_rt);
760 if (IFNET_IS_CELLULAR(ro->ro_rt->rt_ifp)) {
761 RT_UNLOCK(ro->ro_rt);
762 ROUTE_RELEASE(ro);
763 if (flags & IP_OUTARGS) {
764 ipoa->ipoa_retflags |=
765 IPOARF_IFDENIED;
766 }
767 } else {
768 RT_UNLOCK(ro->ro_rt);
769 }
770 }
771 }
772
773 if (ro->ro_rt == NULL) {
774 OSAddAtomic(1, &ipstat.ips_noroute);
775 error = EHOSTUNREACH;
776 if (ia0 != NULL) {
777 IFA_REMREF(ia0);
778 ia0 = NULL;
779 }
780 goto bad;
781 }
782
783 if (ia != NULL)
784 IFA_REMREF(&ia->ia_ifa);
785 RT_LOCK_SPIN(ro->ro_rt);
786 ia = ifatoia(ro->ro_rt->rt_ifa);
787 if (ia != NULL) {
788 /* Become a regular mutex */
789 RT_CONVERT_LOCK(ro->ro_rt);
790 IFA_ADDREF(&ia->ia_ifa);
791 }
792 /*
793 * Note: ia_ifp may not be the same as rt_ifp; the latter
794 * is what we use for determining outbound i/f, mtu, etc.
795 */
796 ifp = ro->ro_rt->rt_ifp;
797 ro->ro_rt->rt_use++;
798 if (ro->ro_rt->rt_flags & RTF_GATEWAY) {
799 dst = SIN(ro->ro_rt->rt_gateway);
800 }
801 if (ro->ro_rt->rt_flags & RTF_HOST) {
802 /* double negation needed for bool bit field */
803 ipobf.isbroadcast =
804 !!(ro->ro_rt->rt_flags & RTF_BROADCAST);
805 } else {
806 /* Become a regular mutex */
807 RT_CONVERT_LOCK(ro->ro_rt);
808 ipobf.isbroadcast = in_broadcast(dst->sin_addr, ifp);
809 }
810 /*
811 * For consistency with IPv6, as well as to ensure that
812 * IP_RECVIF is set correctly for packets that are sent
813 * to one of the local addresses. ia (rt_ifa) would have
814 * been fixed up by rt_setif for local routes. This
815 * would make it appear as if the packet arrives on the
816 * interface which owns the local address. Loopback
817 * multicast case is handled separately by ip_mloopback().
818 */
819 if (ia != NULL && (ifp->if_flags & IFF_LOOPBACK) &&
820 !IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
821 uint32_t srcidx;
822
823 m->m_pkthdr.rcvif = ia->ia_ifa.ifa_ifp;
824
825 if (ia0 != NULL)
826 srcidx = ia0->ifa_ifp->if_index;
827 else if ((ro->ro_flags & ROF_SRCIF_SELECTED) &&
828 ro->ro_srcia != NULL)
829 srcidx = ro->ro_srcia->ifa_ifp->if_index;
830 else
831 srcidx = 0;
832
833 ip_setsrcifaddr_info(m, srcidx, NULL);
834 ip_setdstifaddr_info(m, 0, ia);
835 }
836 RT_UNLOCK(ro->ro_rt);
837 if (ia0 != NULL) {
838 IFA_REMREF(ia0);
839 ia0 = NULL;
840 }
841 }
842
843 if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
844 struct ifnet *srcifp = NULL;
845 struct in_multi *inm;
846 u_int32_t vif;
847 u_int8_t ttl = IP_DEFAULT_MULTICAST_TTL;
848 u_int8_t loop = IP_DEFAULT_MULTICAST_LOOP;
849
850 m->m_flags |= M_MCAST;
851 /*
852 * IP destination address is multicast. Make sure "dst"
853 * still points to the address in "ro". (It may have been
854 * changed to point to a gateway address, above.)
855 */
856 dst = SIN(&ro->ro_dst);
857 /*
858 * See if the caller provided any multicast options
859 */
860 if (imo != NULL) {
861 IMO_LOCK(imo);
862 vif = imo->imo_multicast_vif;
863 ttl = imo->imo_multicast_ttl;
864 loop = imo->imo_multicast_loop;
865 if (!(flags & IP_RAWOUTPUT))
866 ip->ip_ttl = ttl;
867 if (imo->imo_multicast_ifp != NULL)
868 ifp = imo->imo_multicast_ifp;
869 IMO_UNLOCK(imo);
870 #if MROUTING
871 if (vif != -1 && (!(flags & IP_RAWOUTPUT) ||
872 ip->ip_src.s_addr == INADDR_ANY))
873 ip->ip_src.s_addr = ip_mcast_src(vif);
874 #endif /* MROUTING */
875 } else if (!(flags & IP_RAWOUTPUT)) {
876 vif = -1;
877 ip->ip_ttl = ttl;
878 }
879 /*
880 * Confirm that the outgoing interface supports multicast.
881 */
882 if (imo == NULL || vif == -1) {
883 if (!(ifp->if_flags & IFF_MULTICAST)) {
884 OSAddAtomic(1, &ipstat.ips_noroute);
885 error = ENETUNREACH;
886 goto bad;
887 }
888 }
889 /*
890 * If source address not specified yet, use address
891 * of outgoing interface.
892 */
893 if (ip->ip_src.s_addr == INADDR_ANY) {
894 struct in_ifaddr *ia1;
895 lck_rw_lock_shared(in_ifaddr_rwlock);
896 TAILQ_FOREACH(ia1, &in_ifaddrhead, ia_link) {
897 IFA_LOCK_SPIN(&ia1->ia_ifa);
898 if (ia1->ia_ifp == ifp) {
899 ip->ip_src = IA_SIN(ia1)->sin_addr;
900 srcifp = ifp;
901 IFA_UNLOCK(&ia1->ia_ifa);
902 break;
903 }
904 IFA_UNLOCK(&ia1->ia_ifa);
905 }
906 lck_rw_done(in_ifaddr_rwlock);
907 if (ip->ip_src.s_addr == INADDR_ANY) {
908 error = ENETUNREACH;
909 goto bad;
910 }
911 }
912
913 in_multihead_lock_shared();
914 IN_LOOKUP_MULTI(&pkt_dst, ifp, inm);
915 in_multihead_lock_done();
916 if (inm != NULL && (imo == NULL || loop)) {
917 /*
918 * If we belong to the destination multicast group
919 * on the outgoing interface, and the caller did not
920 * forbid loopback, loop back a copy.
921 */
922 if (!TAILQ_EMPTY(&ipv4_filters)) {
923 struct ipfilter *filter;
924 int seen = (inject_filter_ref == NULL);
925
926 if (imo != NULL) {
927 ipf_pktopts.ippo_flags |=
928 IPPOF_MCAST_OPTS;
929 ipf_pktopts.ippo_mcast_ifnet = ifp;
930 ipf_pktopts.ippo_mcast_ttl = ttl;
931 ipf_pktopts.ippo_mcast_loop = loop;
932 }
933
934 ipf_ref();
935
936 /*
937 * 4135317 - always pass network byte
938 * order to filter
939 */
940 #if BYTE_ORDER != BIG_ENDIAN
941 HTONS(ip->ip_len);
942 HTONS(ip->ip_off);
943 #endif
944 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
945 if (seen == 0) {
946 if ((struct ipfilter *)
947 inject_filter_ref == filter)
948 seen = 1;
949 } else if (filter->ipf_filter.
950 ipf_output != NULL) {
951 errno_t result;
952 result = filter->ipf_filter.
953 ipf_output(filter->
954 ipf_filter.cookie,
955 (mbuf_t *)&m, ippo);
956 if (result == EJUSTRETURN) {
957 ipf_unref();
958 INM_REMREF(inm);
959 goto done;
960 }
961 if (result != 0) {
962 ipf_unref();
963 INM_REMREF(inm);
964 goto bad;
965 }
966 }
967 }
968
969 /* set back to host byte order */
970 ip = mtod(m, struct ip *);
971 #if BYTE_ORDER != BIG_ENDIAN
972 NTOHS(ip->ip_len);
973 NTOHS(ip->ip_off);
974 #endif
975 ipf_unref();
976 ipobf.didfilter = TRUE;
977 }
978 ip_mloopback(srcifp, ifp, m, dst, hlen);
979 }
980 #if MROUTING
981 else {
982 /*
983 * If we are acting as a multicast router, perform
984 * multicast forwarding as if the packet had just
985 * arrived on the interface to which we are about
986 * to send. The multicast forwarding function
987 * recursively calls this function, using the
988 * IP_FORWARDING flag to prevent infinite recursion.
989 *
990 * Multicasts that are looped back by ip_mloopback(),
991 * above, will be forwarded by the ip_input() routine,
992 * if necessary.
993 */
994 if (ip_mrouter && !(flags & IP_FORWARDING)) {
995 /*
996 * Check if rsvp daemon is running. If not,
997 * don't set ip_moptions. This ensures that
998 * the packet is multicast and not just sent
999 * down one link as prescribed by rsvpd.
1000 */
1001 if (!rsvp_on)
1002 imo = NULL;
1003 if (ip_mforward(ip, ifp, m, imo) != 0) {
1004 m_freem(m);
1005 if (inm != NULL)
1006 INM_REMREF(inm);
1007 OSAddAtomic(1, &ipstat.ips_cantforward);
1008 goto done;
1009 }
1010 }
1011 }
1012 #endif /* MROUTING */
1013 if (inm != NULL)
1014 INM_REMREF(inm);
1015 /*
1016 * Multicasts with a time-to-live of zero may be looped-
1017 * back, above, but must not be transmitted on a network.
1018 * Also, multicasts addressed to the loopback interface
1019 * are not sent -- the above call to ip_mloopback() will
1020 * loop back a copy if this host actually belongs to the
1021 * destination group on the loopback interface.
1022 */
1023 if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
1024 m_freem(m);
1025 goto done;
1026 }
1027
1028 goto sendit;
1029 }
1030 /*
1031 * If source address not specified yet, use address
1032 * of outgoing interface.
1033 */
1034 if (ip->ip_src.s_addr == INADDR_ANY) {
1035 IFA_LOCK_SPIN(&ia->ia_ifa);
1036 ip->ip_src = IA_SIN(ia)->sin_addr;
1037 IFA_UNLOCK(&ia->ia_ifa);
1038 #if IPFIREWALL_FORWARD
1039 /*
1040 * Keep note that we did this - if the firewall changes
1041 * the next-hop, our interface may change, changing the
1042 * default source IP. It's a shame so much effort happens
1043 * twice. Oh well.
1044 */
1045 ipobf.fwd_rewrite_src = TRUE;
1046 #endif /* IPFIREWALL_FORWARD */
1047 }
1048
1049 /*
1050 * Look for a broadcast address and
1051 * verify the user is allowed to send
1052 * such a packet.
1053 */
1054 if (ipobf.isbroadcast) {
1055 if (!(ifp->if_flags & IFF_BROADCAST)) {
1056 error = EADDRNOTAVAIL;
1057 goto bad;
1058 }
1059 if (!(flags & IP_ALLOWBROADCAST)) {
1060 error = EACCES;
1061 goto bad;
1062 }
1063 /* don't allow broadcast messages to be fragmented */
1064 if ((u_short)ip->ip_len > ifp->if_mtu) {
1065 error = EMSGSIZE;
1066 goto bad;
1067 }
1068 m->m_flags |= M_BCAST;
1069 } else {
1070 m->m_flags &= ~M_BCAST;
1071 }
1072
1073 sendit:
1074 #if PF
1075 /* Invoke outbound packet filter */
1076 if (PF_IS_ENABLED) {
1077 int rc;
1078
1079 m0 = m; /* Save for later */
1080 #if DUMMYNET
1081 args.fwa_m = m;
1082 args.fwa_next_hop = dst;
1083 args.fwa_oif = ifp;
1084 args.fwa_ro = ro;
1085 args.fwa_dst = dst;
1086 args.fwa_oflags = flags;
1087 if (flags & IP_OUTARGS)
1088 args.fwa_ipoa = ipoa;
1089 rc = pf_af_hook(ifp, mppn, &m, AF_INET, FALSE, &args);
1090 #else /* DUMMYNET */
1091 rc = pf_af_hook(ifp, mppn, &m, AF_INET, FALSE, NULL);
1092 #endif /* DUMMYNET */
1093 if (rc != 0 || m == NULL) {
1094 /* Move to the next packet */
1095 m = *mppn;
1096
1097 /* Skip ahead if first packet in list got dropped */
1098 if (packetlist == m0)
1099 packetlist = m;
1100
1101 if (m != NULL) {
1102 m0 = m;
1103 /* Next packet in the chain */
1104 goto loopit;
1105 } else if (packetlist != NULL) {
1106 /* No more packet; send down the chain */
1107 goto sendchain;
1108 }
1109 /* Nothing left; we're done */
1110 goto done;
1111 }
1112 m0 = m;
1113 ip = mtod(m, struct ip *);
1114 pkt_dst = ip->ip_dst;
1115 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1116 }
1117 #endif /* PF */
1118 /*
1119 * Force IP TTL to 255 following draft-ietf-zeroconf-ipv4-linklocal.txt
1120 */
1121 if (IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)) ||
1122 IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) {
1123 ip_linklocal_stat.iplls_out_total++;
1124 if (ip->ip_ttl != MAXTTL) {
1125 ip_linklocal_stat.iplls_out_badttl++;
1126 ip->ip_ttl = MAXTTL;
1127 }
1128 }
1129
1130 if (!ipobf.didfilter && !TAILQ_EMPTY(&ipv4_filters)) {
1131 struct ipfilter *filter;
1132 int seen = (inject_filter_ref == NULL);
1133 ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS;
1134
1135 /*
1136 * Check that a TSO frame isn't passed to a filter.
1137 * This could happen if a filter is inserted while
1138 * TCP is sending the TSO packet.
1139 */
1140 if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) {
1141 error = EMSGSIZE;
1142 goto bad;
1143 }
1144
1145 ipf_ref();
1146
1147 /* 4135317 - always pass network byte order to filter */
1148 #if BYTE_ORDER != BIG_ENDIAN
1149 HTONS(ip->ip_len);
1150 HTONS(ip->ip_off);
1151 #endif
1152 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
1153 if (seen == 0) {
1154 if ((struct ipfilter *)inject_filter_ref ==
1155 filter)
1156 seen = 1;
1157 } else if (filter->ipf_filter.ipf_output) {
1158 errno_t result;
1159 result = filter->ipf_filter.
1160 ipf_output(filter->ipf_filter.cookie,
1161 (mbuf_t *)&m, ippo);
1162 if (result == EJUSTRETURN) {
1163 ipf_unref();
1164 goto done;
1165 }
1166 if (result != 0) {
1167 ipf_unref();
1168 goto bad;
1169 }
1170 }
1171 }
1172 /* set back to host byte order */
1173 ip = mtod(m, struct ip *);
1174 #if BYTE_ORDER != BIG_ENDIAN
1175 NTOHS(ip->ip_len);
1176 NTOHS(ip->ip_off);
1177 #endif
1178 ipf_unref();
1179 }
1180
1181 #if IPSEC
1182 /* temporary for testing only: bypass IPsec altogether */
1183
1184 if (ipsec_bypass != 0 || (flags & IP_NOIPSEC))
1185 goto skip_ipsec;
1186
1187 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
1188
1189 /* May have been set above if packet was bound */
1190 if (sp == NULL) {
1191 /* get SP for this packet */
1192 if (so == NULL)
1193 sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND,
1194 flags, &error);
1195 else
1196 sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND,
1197 so, &error);
1198
1199 if (sp == NULL) {
1200 IPSEC_STAT_INCREMENT(ipsecstat.out_inval);
1201 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1202 0, 0, 0, 0, 0);
1203 goto bad;
1204 }
1205 }
1206
1207 error = 0;
1208
1209 /* check policy */
1210 switch (sp->policy) {
1211 case IPSEC_POLICY_DISCARD:
1212 case IPSEC_POLICY_GENERATE:
1213 /*
1214 * This packet is just discarded.
1215 */
1216 IPSEC_STAT_INCREMENT(ipsecstat.out_polvio);
1217 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1218 1, 0, 0, 0, 0);
1219 goto bad;
1220
1221 case IPSEC_POLICY_BYPASS:
1222 case IPSEC_POLICY_NONE:
1223 /* no need to do IPsec. */
1224 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1225 2, 0, 0, 0, 0);
1226 goto skip_ipsec;
1227
1228 case IPSEC_POLICY_IPSEC:
1229 if (sp->req == NULL) {
1230 /* acquire a policy */
1231 error = key_spdacquire(sp);
1232 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1233 3, 0, 0, 0, 0);
1234 goto bad;
1235 }
1236 if (sp->ipsec_if) {
1237 /* Verify the redirect to ipsec interface */
1238 if (sp->ipsec_if == ifp) {
1239 /* Set policy for mbuf */
1240 m->m_pkthdr.ipsec_policy = sp->id;
1241 goto skip_ipsec;
1242 }
1243 goto bad;
1244 }
1245 break;
1246
1247 case IPSEC_POLICY_ENTRUST:
1248 default:
1249 printf("ip_output: Invalid policy found. %d\n", sp->policy);
1250 }
1251 {
1252 ipsec_state.m = m;
1253 if (flags & IP_ROUTETOIF) {
1254 bzero(&ipsec_state.ro, sizeof (ipsec_state.ro));
1255 } else {
1256 route_copyout(&ipsec_state.ro, ro, sizeof (ipsec_state.ro));
1257 }
1258 ipsec_state.dst = SA(dst);
1259
1260 ip->ip_sum = 0;
1261
1262 /*
1263 * XXX
1264 * delayed checksums are not currently compatible with IPsec
1265 */
1266 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA)
1267 in_delayed_cksum(m);
1268
1269 #if BYTE_ORDER != BIG_ENDIAN
1270 HTONS(ip->ip_len);
1271 HTONS(ip->ip_off);
1272 #endif
1273
1274 DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
1275 struct ip *, ip, struct ifnet *, ifp,
1276 struct ip *, ip, struct ip6_hdr *, NULL);
1277
1278 error = ipsec4_output(&ipsec_state, sp, flags);
1279
1280 m0 = m = ipsec_state.m;
1281
1282 #if DUMMYNET
1283 /*
1284 * If we're about to use the route in ipsec_state
1285 * and this came from dummynet, clean up now.
1286 */
1287 if (ro == &saved_route &&
1288 (!(flags & IP_ROUTETOIF) || ipsec_state.tunneled))
1289 ROUTE_RELEASE(ro);
1290 #endif /* DUMMYNET */
1291
1292 if (flags & IP_ROUTETOIF) {
1293 /*
1294 * if we have tunnel mode SA, we may need to ignore
1295 * IP_ROUTETOIF.
1296 */
1297 if (ipsec_state.tunneled) {
1298 flags &= ~IP_ROUTETOIF;
1299 ro = &ipsec_state.ro;
1300 }
1301 } else {
1302 ro = &ipsec_state.ro;
1303 }
1304 dst = SIN(ipsec_state.dst);
1305 if (error) {
1306 /* mbuf is already reclaimed in ipsec4_output. */
1307 m0 = NULL;
1308 switch (error) {
1309 case EHOSTUNREACH:
1310 case ENETUNREACH:
1311 case EMSGSIZE:
1312 case ENOBUFS:
1313 case ENOMEM:
1314 break;
1315 default:
1316 printf("ip4_output (ipsec): error code %d\n", error);
1317 /* FALLTHRU */
1318 case ENOENT:
1319 /* don't show these error codes to the user */
1320 error = 0;
1321 break;
1322 }
1323 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1324 4, 0, 0, 0, 0);
1325 goto bad;
1326 }
1327 }
1328
1329 /* be sure to update variables that are affected by ipsec4_output() */
1330 ip = mtod(m, struct ip *);
1331
1332 #ifdef _IP_VHL
1333 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1334 #else /* !_IP_VHL */
1335 hlen = ip->ip_hl << 2;
1336 #endif /* !_IP_VHL */
1337 /* Check that there wasn't a route change and src is still valid */
1338 if (ROUTE_UNUSABLE(ro)) {
1339 ROUTE_RELEASE(ro);
1340 VERIFY(src_ia == NULL);
1341 if (ip->ip_src.s_addr != INADDR_ANY &&
1342 !(flags & (IP_ROUTETOIF | IP_FORWARDING)) &&
1343 (src_ia = ifa_foraddr(ip->ip_src.s_addr)) == NULL) {
1344 error = EADDRNOTAVAIL;
1345 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1346 5, 0, 0, 0, 0);
1347 goto bad;
1348 }
1349 if (src_ia != NULL) {
1350 IFA_REMREF(&src_ia->ia_ifa);
1351 src_ia = NULL;
1352 }
1353 }
1354
1355 if (ro->ro_rt == NULL) {
1356 if (!(flags & IP_ROUTETOIF)) {
1357 printf("%s: can't update route after "
1358 "IPsec processing\n", __func__);
1359 error = EHOSTUNREACH; /* XXX */
1360 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1361 6, 0, 0, 0, 0);
1362 goto bad;
1363 }
1364 } else {
1365 if (ia != NULL)
1366 IFA_REMREF(&ia->ia_ifa);
1367 RT_LOCK_SPIN(ro->ro_rt);
1368 ia = ifatoia(ro->ro_rt->rt_ifa);
1369 if (ia != NULL) {
1370 /* Become a regular mutex */
1371 RT_CONVERT_LOCK(ro->ro_rt);
1372 IFA_ADDREF(&ia->ia_ifa);
1373 }
1374 ifp = ro->ro_rt->rt_ifp;
1375 RT_UNLOCK(ro->ro_rt);
1376 }
1377
1378 /* make it flipped, again. */
1379 #if BYTE_ORDER != BIG_ENDIAN
1380 NTOHS(ip->ip_len);
1381 NTOHS(ip->ip_off);
1382 #endif
1383 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1384 7, 0xff, 0xff, 0xff, 0xff);
1385
1386 /* Pass to filters again */
1387 if (!TAILQ_EMPTY(&ipv4_filters)) {
1388 struct ipfilter *filter;
1389
1390 ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS;
1391
1392 /*
1393 * Check that a TSO frame isn't passed to a filter.
1394 * This could happen if a filter is inserted while
1395 * TCP is sending the TSO packet.
1396 */
1397 if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) {
1398 error = EMSGSIZE;
1399 goto bad;
1400 }
1401
1402 ipf_ref();
1403
1404 /* 4135317 - always pass network byte order to filter */
1405 #if BYTE_ORDER != BIG_ENDIAN
1406 HTONS(ip->ip_len);
1407 HTONS(ip->ip_off);
1408 #endif
1409 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
1410 if (filter->ipf_filter.ipf_output) {
1411 errno_t result;
1412 result = filter->ipf_filter.
1413 ipf_output(filter->ipf_filter.cookie,
1414 (mbuf_t *)&m, ippo);
1415 if (result == EJUSTRETURN) {
1416 ipf_unref();
1417 goto done;
1418 }
1419 if (result != 0) {
1420 ipf_unref();
1421 goto bad;
1422 }
1423 }
1424 }
1425 /* set back to host byte order */
1426 ip = mtod(m, struct ip *);
1427 #if BYTE_ORDER != BIG_ENDIAN
1428 NTOHS(ip->ip_len);
1429 NTOHS(ip->ip_off);
1430 #endif
1431 ipf_unref();
1432 }
1433 skip_ipsec:
1434 #endif /* IPSEC */
1435
1436 #if IPFIREWALL
1437 /*
1438 * Check with the firewall...
1439 * but not if we are already being fwd'd from a firewall.
1440 */
1441 if (fw_enable && IPFW_LOADED && !args.fwa_next_hop) {
1442 struct sockaddr_in *old = dst;
1443
1444 args.fwa_m = m;
1445 args.fwa_next_hop = dst;
1446 args.fwa_oif = ifp;
1447 ipfwoff = ip_fw_chk_ptr(&args);
1448 m = args.fwa_m;
1449 dst = args.fwa_next_hop;
1450
1451 /*
1452 * On return we must do the following:
1453 * IP_FW_PORT_DENY_FLAG -> drop the pkt (XXX new)
1454 * 1 <= off <= 0xffff -> DIVERT
1455 * (off & IP_FW_PORT_DYNT_FLAG) -> send to a DUMMYNET pipe
1456 * (off & IP_FW_PORT_TEE_FLAG) -> TEE the packet
1457 * dst != old -> IPFIREWALL_FORWARD
1458 * off==0, dst==old -> accept
1459 * If some of the above modules are not compiled in, then
1460 * we shouldn't have to check the corresponding condition
1461 * (because the ipfw control socket should not accept
1462 * unsupported rules), but it is better to play safe and drop
1463 * packets in case of doubt.
1464 */
1465 m0 = m;
1466 if ((ipfwoff & IP_FW_PORT_DENY_FLAG) || m == NULL) {
1467 if (m)
1468 m_freem(m);
1469 error = EACCES;
1470 goto done;
1471 }
1472 ip = mtod(m, struct ip *);
1473
1474 if (ipfwoff == 0 && dst == old) { /* common case */
1475 goto pass;
1476 }
1477 #if DUMMYNET
1478 if (DUMMYNET_LOADED && (ipfwoff & IP_FW_PORT_DYNT_FLAG) != 0) {
1479 /*
1480 * pass the pkt to dummynet. Need to include
1481 * pipe number, m, ifp, ro, dst because these are
1482 * not recomputed in the next pass.
1483 * All other parameters have already been used and
1484 * so they are not needed anymore.
1485 * XXX note: if the ifp or ro entry are deleted
1486 * while a pkt is in dummynet, we are in trouble!
1487 */
1488 args.fwa_ro = ro;
1489 args.fwa_dst = dst;
1490 args.fwa_oflags = flags;
1491 if (flags & IP_OUTARGS)
1492 args.fwa_ipoa = ipoa;
1493
1494 error = ip_dn_io_ptr(m, ipfwoff & 0xffff, DN_TO_IP_OUT,
1495 &args, DN_CLIENT_IPFW);
1496 goto done;
1497 }
1498 #endif /* DUMMYNET */
1499 #if IPDIVERT
1500 if (ipfwoff != 0 && (ipfwoff & IP_FW_PORT_DYNT_FLAG) == 0) {
1501 struct mbuf *clone = NULL;
1502
1503 /* Clone packet if we're doing a 'tee' */
1504 if ((ipfwoff & IP_FW_PORT_TEE_FLAG) != 0)
1505 clone = m_dup(m, M_DONTWAIT);
1506 /*
1507 * XXX
1508 * delayed checksums are not currently compatible
1509 * with divert sockets.
1510 */
1511 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA)
1512 in_delayed_cksum(m);
1513
1514 /* Restore packet header fields to original values */
1515
1516 #if BYTE_ORDER != BIG_ENDIAN
1517 HTONS(ip->ip_len);
1518 HTONS(ip->ip_off);
1519 #endif
1520
1521 /* Deliver packet to divert input routine */
1522 divert_packet(m, 0, ipfwoff & 0xffff,
1523 args.fwa_divert_rule);
1524
1525 /* If 'tee', continue with original packet */
1526 if (clone != NULL) {
1527 m0 = m = clone;
1528 ip = mtod(m, struct ip *);
1529 goto pass;
1530 }
1531 goto done;
1532 }
1533 #endif /* IPDIVERT */
1534 #if IPFIREWALL_FORWARD
1535 /*
1536 * Here we check dst to make sure it's directly reachable on
1537 * the interface we previously thought it was.
1538 * If it isn't (which may be likely in some situations) we have
1539 * to re-route it (ie, find a route for the next-hop and the
1540 * associated interface) and set them here. This is nested
1541 * forwarding which in most cases is undesirable, except where
1542 * such control is nigh impossible. So we do it here.
1543 * And I'm babbling.
1544 */
1545 if (ipfwoff == 0 && old != dst) {
1546 struct in_ifaddr *ia_fw;
1547 struct route *ro_fwd = &sro_fwd;
1548
1549 #if IPFIREWALL_FORWARD_DEBUG
1550 printf("IPFIREWALL_FORWARD: New dst ip: ");
1551 print_ip(dst->sin_addr);
1552 printf("\n");
1553 #endif /* IPFIREWALL_FORWARD_DEBUG */
1554 /*
1555 * We need to figure out if we have been forwarded
1556 * to a local socket. If so then we should somehow
1557 * "loop back" to ip_input, and get directed to the
1558 * PCB as if we had received this packet. This is
1559 * because it may be difficult to identify the packets
1560 * you want to forward until they are being output
1561 * and have selected an interface (e.g. locally
1562 * initiated packets). If we used the loopback interface,
1563 * we would not be able to control what happens
1564 * as the packet runs through ip_input(), as
1565 * it is done through an ISR.
1566 */
1567 lck_rw_lock_shared(in_ifaddr_rwlock);
1568 TAILQ_FOREACH(ia_fw, &in_ifaddrhead, ia_link) {
1569 /*
1570 * If the addr to forward to is one
1571 * of ours, we pretend to
1572 * be the destination for this packet.
1573 */
1574 IFA_LOCK_SPIN(&ia_fw->ia_ifa);
1575 if (IA_SIN(ia_fw)->sin_addr.s_addr ==
1576 dst->sin_addr.s_addr) {
1577 IFA_UNLOCK(&ia_fw->ia_ifa);
1578 break;
1579 }
1580 IFA_UNLOCK(&ia_fw->ia_ifa);
1581 }
1582 lck_rw_done(in_ifaddr_rwlock);
1583 if (ia_fw) {
1584 /* tell ip_input "dont filter" */
1585 struct m_tag *fwd_tag;
1586 struct ip_fwd_tag *ipfwd_tag;
1587
1588 fwd_tag = m_tag_create(KERNEL_MODULE_TAG_ID,
1589 KERNEL_TAG_TYPE_IPFORWARD,
1590 sizeof (*ipfwd_tag), M_NOWAIT, m);
1591 if (fwd_tag == NULL) {
1592 error = ENOBUFS;
1593 goto bad;
1594 }
1595
1596 ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag+1);
1597 ipfwd_tag->next_hop = args.fwa_next_hop;
1598
1599 m_tag_prepend(m, fwd_tag);
1600
1601 if (m->m_pkthdr.rcvif == NULL)
1602 m->m_pkthdr.rcvif = lo_ifp;
1603
1604 #if BYTE_ORDER != BIG_ENDIAN
1605 HTONS(ip->ip_len);
1606 HTONS(ip->ip_off);
1607 #endif
1608 mbuf_outbound_finalize(m, PF_INET, 0);
1609
1610 /*
1611 * we need to call dlil_output to run filters
1612 * and resync to avoid recursion loops.
1613 */
1614 if (lo_ifp) {
1615 dlil_output(lo_ifp, PF_INET, m, NULL,
1616 SA(dst), 0, adv);
1617 } else {
1618 printf("%s: no loopback ifp for "
1619 "forwarding!!!\n", __func__);
1620 }
1621 goto done;
1622 }
1623 /*
1624 * Some of the logic for this was nicked from above.
1625 *
1626 * This rewrites the cached route in a local PCB.
1627 * Is this what we want to do?
1628 */
1629 ROUTE_RELEASE(ro_fwd);
1630 bcopy(dst, &ro_fwd->ro_dst, sizeof (*dst));
1631
1632 rtalloc_ign(ro_fwd, RTF_PRCLONING);
1633
1634 if (ro_fwd->ro_rt == NULL) {
1635 OSAddAtomic(1, &ipstat.ips_noroute);
1636 error = EHOSTUNREACH;
1637 goto bad;
1638 }
1639
1640 RT_LOCK_SPIN(ro_fwd->ro_rt);
1641 ia_fw = ifatoia(ro_fwd->ro_rt->rt_ifa);
1642 if (ia_fw != NULL) {
1643 /* Become a regular mutex */
1644 RT_CONVERT_LOCK(ro_fwd->ro_rt);
1645 IFA_ADDREF(&ia_fw->ia_ifa);
1646 }
1647 ifp = ro_fwd->ro_rt->rt_ifp;
1648 ro_fwd->ro_rt->rt_use++;
1649 if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY)
1650 dst = SIN(ro_fwd->ro_rt->rt_gateway);
1651 if (ro_fwd->ro_rt->rt_flags & RTF_HOST) {
1652 /* double negation needed for bool bit field */
1653 ipobf.isbroadcast =
1654 !!(ro_fwd->ro_rt->rt_flags & RTF_BROADCAST);
1655 } else {
1656 /* Become a regular mutex */
1657 RT_CONVERT_LOCK(ro_fwd->ro_rt);
1658 ipobf.isbroadcast =
1659 in_broadcast(dst->sin_addr, ifp);
1660 }
1661 RT_UNLOCK(ro_fwd->ro_rt);
1662 ROUTE_RELEASE(ro);
1663 ro->ro_rt = ro_fwd->ro_rt;
1664 ro_fwd->ro_rt = NULL;
1665 dst = SIN(&ro_fwd->ro_dst);
1666
1667 /*
1668 * If we added a default src ip earlier,
1669 * which was taken from the then-selected
1670 * interface, do it again, from the new one.
1671 */
1672 if (ia_fw != NULL) {
1673 if (ipobf.fwd_rewrite_src) {
1674 IFA_LOCK_SPIN(&ia_fw->ia_ifa);
1675 ip->ip_src = IA_SIN(ia_fw)->sin_addr;
1676 IFA_UNLOCK(&ia_fw->ia_ifa);
1677 }
1678 IFA_REMREF(&ia_fw->ia_ifa);
1679 }
1680 goto pass;
1681 }
1682 #endif /* IPFIREWALL_FORWARD */
1683 /*
1684 * if we get here, none of the above matches, and
1685 * we have to drop the pkt
1686 */
1687 m_freem(m);
1688 error = EACCES; /* not sure this is the right error msg */
1689 goto done;
1690 }
1691
1692 pass:
1693 #endif /* IPFIREWALL */
1694
1695 /* 127/8 must not appear on wire - RFC1122 */
1696 if (!(ifp->if_flags & IFF_LOOPBACK) &&
1697 ((ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
1698 (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)) {
1699 OSAddAtomic(1, &ipstat.ips_badaddr);
1700 m_freem(m);
1701 error = EADDRNOTAVAIL;
1702 goto done;
1703 }
1704
1705 ip_output_checksum(ifp, m, (IP_VHL_HL(ip->ip_vhl) << 2),
1706 ip->ip_len, &sw_csum);
1707
1708 /*
1709 * If small enough for interface, or the interface will take
1710 * care of the fragmentation for us, we can just send directly.
1711 */
1712 if ((u_short)ip->ip_len <= ifp->if_mtu || TSO_IPV4_OK(ifp, m) ||
1713 (!(ip->ip_off & IP_DF) && (ifp->if_hwassist & CSUM_FRAGMENT))) {
1714 #if BYTE_ORDER != BIG_ENDIAN
1715 HTONS(ip->ip_len);
1716 HTONS(ip->ip_off);
1717 #endif
1718
1719 ip->ip_sum = 0;
1720 if (sw_csum & CSUM_DELAY_IP) {
1721 ip->ip_sum = ip_cksum_hdr_out(m, hlen);
1722 sw_csum &= ~CSUM_DELAY_IP;
1723 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP;
1724 }
1725
1726 #if IPSEC
1727 /* clean ipsec history once it goes out of the node */
1728 if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC))
1729 ipsec_delaux(m);
1730 #endif /* IPSEC */
1731 if ((m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) &&
1732 (m->m_pkthdr.tso_segsz > 0))
1733 scnt += m->m_pkthdr.len / m->m_pkthdr.tso_segsz;
1734 else
1735 scnt++;
1736
1737 if (packetchain == 0) {
1738 if (ro->ro_rt != NULL && nstat_collect)
1739 nstat_route_tx(ro->ro_rt, scnt,
1740 m->m_pkthdr.len, 0);
1741
1742 error = dlil_output(ifp, PF_INET, m, ro->ro_rt,
1743 SA(dst), 0, adv);
1744 scnt = 0;
1745 goto done;
1746 } else {
1747 /*
1748 * packet chaining allows us to reuse the
1749 * route for all packets
1750 */
1751 bytecnt += m->m_pkthdr.len;
1752 mppn = &m->m_nextpkt;
1753 m = m->m_nextpkt;
1754 if (m == NULL) {
1755 #if PF
1756 sendchain:
1757 #endif /* PF */
1758 if (pktcnt > ip_maxchainsent)
1759 ip_maxchainsent = pktcnt;
1760 if (ro->ro_rt != NULL && nstat_collect)
1761 nstat_route_tx(ro->ro_rt, scnt,
1762 bytecnt, 0);
1763
1764 error = dlil_output(ifp, PF_INET, packetlist,
1765 ro->ro_rt, SA(dst), 0, adv);
1766 pktcnt = 0;
1767 scnt = 0;
1768 bytecnt = 0;
1769 goto done;
1770
1771 }
1772 m0 = m;
1773 pktcnt++;
1774 goto loopit;
1775 }
1776 }
1777 /*
1778 * Too large for interface; fragment if possible.
1779 * Must be able to put at least 8 bytes per fragment.
1780 * Balk when the DF bit is set or the interface doesn't support TSO.
1781 */
1782 if ((ip->ip_off & IP_DF) || pktcnt > 0 ||
1783 (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4)) {
1784 error = EMSGSIZE;
1785 /*
1786 * This case can happen if the user changed the MTU
1787 * of an interface after enabling IP on it. Because
1788 * most netifs don't keep track of routes pointing to
1789 * them, there is no way for one to update all its
1790 * routes when the MTU is changed.
1791 */
1792 if (ro->ro_rt) {
1793 RT_LOCK_SPIN(ro->ro_rt);
1794 if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) &&
1795 !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) &&
1796 (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) {
1797 ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
1798 }
1799 RT_UNLOCK(ro->ro_rt);
1800 }
1801 if (pktcnt > 0) {
1802 m0 = packetlist;
1803 }
1804 OSAddAtomic(1, &ipstat.ips_cantfrag);
1805 goto bad;
1806 }
1807
1808 error = ip_fragment(m, ifp, ifp->if_mtu, sw_csum);
1809 if (error != 0) {
1810 m0 = m = NULL;
1811 goto bad;
1812 }
1813
1814 KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr,
1815 ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len);
1816
1817 for (m = m0; m; m = m0) {
1818 m0 = m->m_nextpkt;
1819 m->m_nextpkt = 0;
1820 #if IPSEC
1821 /* clean ipsec history once it goes out of the node */
1822 if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC))
1823 ipsec_delaux(m);
1824 #endif /* IPSEC */
1825 if (error == 0) {
1826 if ((packetchain != 0) && (pktcnt > 0)) {
1827 panic("%s: mix of packet in packetlist is "
1828 "wrong=%p", __func__, packetlist);
1829 /* NOTREACHED */
1830 }
1831 if (ro->ro_rt != NULL && nstat_collect) {
1832 nstat_route_tx(ro->ro_rt, 1,
1833 m->m_pkthdr.len, 0);
1834 }
1835 error = dlil_output(ifp, PF_INET, m, ro->ro_rt,
1836 SA(dst), 0, adv);
1837 } else {
1838 m_freem(m);
1839 }
1840 }
1841
1842 if (error == 0)
1843 OSAddAtomic(1, &ipstat.ips_fragmented);
1844
1845 done:
1846 if (ia != NULL) {
1847 IFA_REMREF(&ia->ia_ifa);
1848 ia = NULL;
1849 }
1850 #if IPSEC
1851 ROUTE_RELEASE(&ipsec_state.ro);
1852 if (sp != NULL) {
1853 KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
1854 printf("DP ip_output call free SP:%x\n", sp));
1855 key_freesp(sp, KEY_SADB_UNLOCKED);
1856 }
1857 #endif /* IPSEC */
1858 #if DUMMYNET
1859 ROUTE_RELEASE(&saved_route);
1860 #endif /* DUMMYNET */
1861 #if IPFIREWALL_FORWARD
1862 ROUTE_RELEASE(&sro_fwd);
1863 #endif /* IPFIREWALL_FORWARD */
1864
1865 KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_END, error, 0, 0, 0, 0);
1866 return (error);
1867 bad:
1868 m_freem(m0);
1869 goto done;
1870
1871 #undef ipsec_state
1872 #undef args
1873 #undef sro_fwd
1874 #undef saved_route
1875 #undef ipf_pktopts
1876 }
1877
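/*
 * Illustrative sketch (not part of the original source; the helper below is
 * hypothetical): callers such as TCP link pre-built packets through
 * m_nextpkt and pass a nonzero packetchain count so that the route and the
 * dlil_output() call above are shared across the whole list.
 */
#if 0
static int
example_ip_send_chain(struct mbuf *pkts[], int npkts, struct route *ro)
{
	struct mbuf *head = NULL, **tail = &head;
	int i;

	for (i = 0; i < npkts; i++) {
		*tail = pkts[i];
		tail = &pkts[i]->m_nextpkt;
	}
	*tail = NULL;

	/* ro must be non-NULL; a valid cached ro->ro_rt is reused */
	return (ip_output_list(head, npkts, NULL, ro, 0, NULL, NULL));
}
#endif
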
1878 int
1879 ip_fragment(struct mbuf *m, struct ifnet *ifp, unsigned long mtu, int sw_csum)
1880 {
1881 struct ip *ip, *mhip;
1882 int len, hlen, mhlen, firstlen, off, error = 0;
1883 struct mbuf **mnext = &m->m_nextpkt, *m0;
1884 int nfrags = 1;
1885
1886 ip = mtod(m, struct ip *);
1887 #ifdef _IP_VHL
1888 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1889 #else /* !_IP_VHL */
1890 hlen = ip->ip_hl << 2;
1891 #endif /* !_IP_VHL */
1892
1893 firstlen = len = (mtu - hlen) &~ 7;
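/*
 * Worked example (illustrative): with mtu 1500 and a 20-byte header,
 * len = (1500 - 20) & ~7 = 1480, so every non-final fragment carries
 * 1480 payload bytes; rounding down to a multiple of 8 keeps fragment
 * offsets representable in ip_off, which counts 8-byte units.
 */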
1894 if (len < 8) {
1895 m_freem(m);
1896 return (EMSGSIZE);
1897 }
1898
1899 /*
1900 * if the interface will not calculate checksums on
1901 * fragmented packets, then do it here.
1902 */
1903 if ((m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) &&
1904 !(ifp->if_hwassist & CSUM_IP_FRAGS))
1905 in_delayed_cksum(m);
1906
1907 /*
1908 * Loop through length of segment after first fragment,
1909 * make new header and copy data of each part and link onto chain.
1910 */
1911 m0 = m;
1912 mhlen = sizeof (struct ip);
1913 for (off = hlen + len; off < (u_short)ip->ip_len; off += len) {
1914 MGETHDR(m, M_DONTWAIT, MT_HEADER); /* MAC-OK */
1915 if (m == NULL) {
1916 error = ENOBUFS;
1917 OSAddAtomic(1, &ipstat.ips_odropped);
1918 goto sendorfree;
1919 }
1920 m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
1921 m->m_data += max_linkhdr;
1922 mhip = mtod(m, struct ip *);
1923 *mhip = *ip;
1924 if (hlen > sizeof (struct ip)) {
1925 mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
1926 mhip->ip_vhl = IP_MAKE_VHL(IPVERSION, mhlen >> 2);
1927 }
1928 m->m_len = mhlen;
1929 mhip->ip_off = ((off - hlen) >> 3) + (ip->ip_off & ~IP_MF);
1930 if (ip->ip_off & IP_MF)
1931 mhip->ip_off |= IP_MF;
1932 if (off + len >= (u_short)ip->ip_len)
1933 len = (u_short)ip->ip_len - off;
1934 else
1935 mhip->ip_off |= IP_MF;
1936 mhip->ip_len = htons((u_short)(len + mhlen));
1937 m->m_next = m_copy(m0, off, len);
1938 if (m->m_next == NULL) {
1939 (void) m_free(m);
1940 error = ENOBUFS; /* ??? */
1941 OSAddAtomic(1, &ipstat.ips_odropped);
1942 goto sendorfree;
1943 }
1944 m->m_pkthdr.len = mhlen + len;
1945 m->m_pkthdr.rcvif = NULL;
1946 m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
1947
1948 M_COPY_CLASSIFIER(m, m0);
1949 M_COPY_PFTAG(m, m0);
1950
1951 #if CONFIG_MACF_NET
1952 mac_netinet_fragment(m0, m);
1953 #endif /* CONFIG_MACF_NET */
1954
1955 #if BYTE_ORDER != BIG_ENDIAN
1956 HTONS(mhip->ip_off);
1957 #endif
1958
1959 mhip->ip_sum = 0;
1960 if (sw_csum & CSUM_DELAY_IP) {
1961 mhip->ip_sum = ip_cksum_hdr_out(m, mhlen);
1962 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP;
1963 }
1964 *mnext = m;
1965 mnext = &m->m_nextpkt;
1966 nfrags++;
1967 }
1968 OSAddAtomic(nfrags, &ipstat.ips_ofragments);
1969
1970 /* set first/last markers for fragment chain */
1971 m->m_flags |= M_LASTFRAG;
1972 m0->m_flags |= M_FIRSTFRAG | M_FRAG;
1973 m0->m_pkthdr.csum_data = nfrags;
1974
1975 /*
1976 * Update first fragment by trimming what's been copied out
1977 * and updating header, then send each fragment (in order).
1978 */
1979 m = m0;
1980 m_adj(m, hlen + firstlen - (u_short)ip->ip_len);
1981 m->m_pkthdr.len = hlen + firstlen;
1982 ip->ip_len = htons((u_short)m->m_pkthdr.len);
1983 ip->ip_off |= IP_MF;
1984
1985 #if BYTE_ORDER != BIG_ENDIAN
1986 HTONS(ip->ip_off);
1987 #endif
1988
1989 ip->ip_sum = 0;
1990 if (sw_csum & CSUM_DELAY_IP) {
1991 ip->ip_sum = ip_cksum_hdr_out(m, hlen);
1992 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP;
1993 }
1994 sendorfree:
1995 if (error)
1996 m_freem_list(m0);
1997
1998 return (error);
1999 }
2000
2001 static void
2002 ip_out_cksum_stats(int proto, u_int32_t len)
2003 {
2004 switch (proto) {
2005 case IPPROTO_TCP:
2006 tcp_out_cksum_stats(len);
2007 break;
2008 case IPPROTO_UDP:
2009 udp_out_cksum_stats(len);
2010 break;
2011 default:
2012 /* keep only TCP or UDP stats for now */
2013 break;
2014 }
2015 }
2016
2017 /*
2018 * Process a delayed payload checksum calculation (outbound path).
2019 *
2020 * hoff is the number of bytes beyond the mbuf data pointer which
2021 * points to the IP header.
2022 *
2023 * Returns a bitmask representing all the work done in software.
2024 */
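/*
 * A minimal usage sketch, assuming the IP header starts right at the
 * mbuf data pointer (i.e. an hoff of 0):
 *
 *	sw_csum = in_finalize_cksum(m, 0, CSUM_DELAY_IP | CSUM_DELAY_DATA);
 */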
2025 uint32_t
2026 in_finalize_cksum(struct mbuf *m, uint32_t hoff, uint32_t csum_flags)
2027 {
2028 unsigned char buf[15 << 2] __attribute__((aligned(8)));
2029 struct ip *ip;
2030 uint32_t offset, _hlen, mlen, hlen, len, sw_csum;
2031 uint16_t csum, ip_len;
2032
2033 _CASSERT(sizeof (csum) == sizeof (uint16_t));
2034 VERIFY(m->m_flags & M_PKTHDR);
2035
2036 sw_csum = (csum_flags & m->m_pkthdr.csum_flags);
2037
2038 if ((sw_csum &= (CSUM_DELAY_IP | CSUM_DELAY_DATA)) == 0)
2039 goto done;
2040
2041 mlen = m->m_pkthdr.len; /* total mbuf len */
2042
2043 /* sanity check (need at least simple IP header) */
2044 if (mlen < (hoff + sizeof (*ip))) {
2045 panic("%s: mbuf %p pkt len (%u) < hoff+ip_hdr "
2046 "(%u+%u)\n", __func__, m, mlen, hoff,
2047 (uint32_t)sizeof (*ip));
2048 /* NOTREACHED */
2049 }
2050
2051 /*
2052 * In case the IP header is not contiguous, or not 32-bit aligned,
2053 * or if we're computing the IP header checksum, copy it to a local
2054 * buffer. Copy only the simple IP header here (IP options case
2055 * is handled below.)
2056 */
2057 if ((sw_csum & CSUM_DELAY_IP) || (hoff + sizeof (*ip)) > m->m_len ||
2058 !IP_HDR_ALIGNED_P(mtod(m, caddr_t) + hoff)) {
2059 m_copydata(m, hoff, sizeof (*ip), (caddr_t)buf);
2060 ip = (struct ip *)(void *)buf;
2061 _hlen = sizeof (*ip);
2062 } else {
2063 ip = (struct ip *)(void *)(m->m_data + hoff);
2064 _hlen = 0;
2065 }
2066
2067 hlen = IP_VHL_HL(ip->ip_vhl) << 2; /* IP header len */
2068
2069 /* sanity check */
2070 if (mlen < (hoff + hlen)) {
2071 panic("%s: mbuf %p pkt too short (%d) for IP header (%u), "
2072 "hoff %u", __func__, m, mlen, hlen, hoff);
2073 /* NOTREACHED */
2074 }
2075
2076 /*
2077 * We could be in the context of an IP or interface filter; in the
2078 * former case, ip_len would be in host (correct) order while for
2079 * the latter it would be in network order. Because of this, we
2080 * attempt to interpret the length field by comparing it against
2081 * the actual packet length. If the comparison fails, byte swap
2082 * the length and check again. If it still fails, use the actual
2083 * packet length. This also covers the trailing bytes case.
2084 */
2085 ip_len = ip->ip_len;
2086 if (ip_len != (mlen - hoff)) {
2087 ip_len = OSSwapInt16(ip_len);
2088 if (ip_len != (mlen - hoff)) {
2089 printf("%s: mbuf 0x%llx proto %d IP len %d (%x) "
2090 "[swapped %d (%x)] doesn't match actual packet "
2091 "length; %d is used instead\n", __func__,
2092 (uint64_t)VM_KERNEL_ADDRPERM(m), ip->ip_p,
2093 ip->ip_len, ip->ip_len, ip_len, ip_len,
2094 (mlen - hoff));
2095 ip_len = mlen - hoff;
2096 }
2097 }
2098
2099 len = ip_len - hlen; /* csum span */
2100
2101 if (sw_csum & CSUM_DELAY_DATA) {
2102 uint16_t ulpoff;
2103
2104 /*
2105 * offset is added to the lower 16-bit value of csum_data,
2106 * which is expected to contain the ULP offset; therefore
2107 * CSUM_PARTIAL offset adjustment must be undone.
2108 */
2109 if ((m->m_pkthdr.csum_flags & (CSUM_PARTIAL|CSUM_DATA_VALID)) ==
2110 (CSUM_PARTIAL|CSUM_DATA_VALID)) {
2111 /*
2112 * Get back the original ULP offset (this will
2113 * undo the CSUM_PARTIAL logic in ip_output.)
2114 */
2115 m->m_pkthdr.csum_data = (m->m_pkthdr.csum_tx_stuff -
2116 m->m_pkthdr.csum_tx_start);
2117 }
2118
2119 ulpoff = (m->m_pkthdr.csum_data & 0xffff); /* ULP csum offset */
2120 offset = hoff + hlen; /* ULP header */
2121
2122 if (mlen < (ulpoff + sizeof (csum))) {
2123 panic("%s: mbuf %p pkt len (%u) proto %d invalid ULP "
2124 "cksum offset (%u) cksum flags 0x%x\n", __func__,
2125 m, mlen, ip->ip_p, ulpoff, m->m_pkthdr.csum_flags);
2126 /* NOTREACHED */
2127 }
2128
2129 csum = inet_cksum(m, 0, offset, len);
2130
2131 /* Update stats */
2132 ip_out_cksum_stats(ip->ip_p, len);
2133
2134 /* RFC1122 4.1.3.4 */
2135 if (csum == 0 && (m->m_pkthdr.csum_flags & CSUM_UDP))
2136 csum = 0xffff;
2137
2138 /* Insert the checksum in the ULP csum field */
2139 offset += ulpoff;
2140 if (offset + sizeof (csum) > m->m_len) {
2141 m_copyback(m, offset, sizeof (csum), &csum);
2142 } else if (IP_HDR_ALIGNED_P(mtod(m, char *) + hoff)) {
2143 *(uint16_t *)(void *)(mtod(m, char *) + offset) = csum;
2144 } else {
2145 bcopy(&csum, (mtod(m, char *) + offset), sizeof (csum));
2146 }
2147 m->m_pkthdr.csum_flags &=
2148 ~(CSUM_DELAY_DATA | CSUM_DATA_VALID | CSUM_PARTIAL);
2149 }
2150
2151 if (sw_csum & CSUM_DELAY_IP) {
2152 /* IP header must be in the local buffer */
2153 VERIFY(_hlen == sizeof (*ip));
2154 if (_hlen != hlen) {
2155 VERIFY(hlen <= sizeof (buf));
2156 m_copydata(m, hoff, hlen, (caddr_t)buf);
2157 ip = (struct ip *)(void *)buf;
2158 _hlen = hlen;
2159 }
2160
2161 /*
2162 * Compute the IP header checksum as if the IP length
2163 * is the length which we believe is "correct"; see
2164 * how ip_len gets calculated above. Note that this
2165 * is done on the local copy and not on the real one.
2166 */
2167 ip->ip_len = htons(ip_len);
2168 ip->ip_sum = 0;
2169 csum = in_cksum_hdr_opt(ip);
2170
2171 /* Update stats */
2172 ipstat.ips_snd_swcsum++;
2173 ipstat.ips_snd_swcsum_bytes += hlen;
2174
2175 /*
2176 * Insert only the checksum in the existing IP header
2177 * csum field; all other fields are left unchanged.
2178 */
2179 offset = hoff + offsetof(struct ip, ip_sum);
2180 if (offset + sizeof (csum) > m->m_len) {
2181 m_copyback(m, offset, sizeof (csum), &csum);
2182 } else if (IP_HDR_ALIGNED_P(mtod(m, char *) + hoff)) {
2183 *(uint16_t *)(void *)(mtod(m, char *) + offset) = csum;
2184 } else {
2185 bcopy(&csum, (mtod(m, char *) + offset), sizeof (csum));
2186 }
2187 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP;
2188 }
2189
2190 done:
2191 return (sw_csum);
2192 }
2193
2194 /*
2195 * Insert IP options into preformed packet.
2196 * Adjust IP destination as required for IP source routing,
2197 * as indicated by a non-zero in_addr at the start of the options.
2198 *
2199 * XXX This routine assumes that the packet has no options in place.
2200 */
2201 static struct mbuf *
2202 ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen)
2203 {
2204 struct ipoption *p = mtod(opt, struct ipoption *);
2205 struct mbuf *n;
2206 struct ip *ip = mtod(m, struct ip *);
2207 unsigned optlen;
2208
2209 optlen = opt->m_len - sizeof (p->ipopt_dst);
2210 if (optlen + (u_short)ip->ip_len > IP_MAXPACKET)
2211 return (m); /* XXX should fail */
2212 if (p->ipopt_dst.s_addr)
2213 ip->ip_dst = p->ipopt_dst;
2214 if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) {
2215 MGETHDR(n, M_DONTWAIT, MT_HEADER); /* MAC-OK */
2216 if (n == NULL)
2217 return (m);
2218 n->m_pkthdr.rcvif = 0;
2219 #if CONFIG_MACF_NET
2220 mac_mbuf_label_copy(m, n);
2221 #endif /* CONFIG_MACF_NET */
2222 n->m_pkthdr.len = m->m_pkthdr.len + optlen;
2223 m->m_len -= sizeof (struct ip);
2224 m->m_data += sizeof (struct ip);
2225 n->m_next = m;
2226 m = n;
2227 m->m_len = optlen + sizeof (struct ip);
2228 m->m_data += max_linkhdr;
2229 (void) memcpy(mtod(m, void *), ip, sizeof (struct ip));
2230 } else {
2231 m->m_data -= optlen;
2232 m->m_len += optlen;
2233 m->m_pkthdr.len += optlen;
2234 ovbcopy((caddr_t)ip, mtod(m, caddr_t), sizeof (struct ip));
2235 }
2236 ip = mtod(m, struct ip *);
2237 bcopy(p->ipopt_list, ip + 1, optlen);
2238 *phlen = sizeof (struct ip) + optlen;
2239 ip->ip_vhl = IP_MAKE_VHL(IPVERSION, *phlen >> 2);
2240 ip->ip_len += optlen;
2241 return (m);
2242 }
2243
2244 /*
2245 * Copy options from ip to jp,
2246 * omitting those not copied during fragmentation.
2247 */
2248 static int
2249 ip_optcopy(struct ip *ip, struct ip *jp)
2250 {
2251 u_char *cp, *dp;
2252 int opt, optlen, cnt;
2253
2254 cp = (u_char *)(ip + 1);
2255 dp = (u_char *)(jp + 1);
2256 cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip);
2257 for (; cnt > 0; cnt -= optlen, cp += optlen) {
2258 opt = cp[0];
2259 if (opt == IPOPT_EOL)
2260 break;
2261 if (opt == IPOPT_NOP) {
2262 /* Preserve for IP mcast tunnel's LSRR alignment. */
2263 *dp++ = IPOPT_NOP;
2264 optlen = 1;
2265 continue;
2266 }
2267 #if DIAGNOSTIC
2268 if (cnt < IPOPT_OLEN + sizeof (*cp)) {
2269 panic("malformed IPv4 option passed to ip_optcopy");
2270 /* NOTREACHED */
2271 }
2272 #endif
2273 optlen = cp[IPOPT_OLEN];
2274 #if DIAGNOSTIC
2275 if (optlen < IPOPT_OLEN + sizeof (*cp) || optlen > cnt) {
2276 panic("malformed IPv4 option passed to ip_optcopy");
2277 /* NOTREACHED */
2278 }
2279 #endif
2280 /* bogus lengths should have been caught by ip_dooptions */
2281 if (optlen > cnt)
2282 optlen = cnt;
2283 if (IPOPT_COPIED(opt)) {
2284 bcopy(cp, dp, optlen);
2285 dp += optlen;
2286 }
2287 }
2288 for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++)
2289 *dp++ = IPOPT_EOL;
2290 return (optlen);
2291 }
2292
2293 /*
2294 * IP socket option processing.
2295 */
2296 int
2297 ip_ctloutput(struct socket *so, struct sockopt *sopt)
2298 {
2299 struct inpcb *inp = sotoinpcb(so);
2300 int error, optval;
2301
2302 error = optval = 0;
2303 if (sopt->sopt_level != IPPROTO_IP)
2304 return (EINVAL);
2305
2306 switch (sopt->sopt_dir) {
2307 case SOPT_SET:
2308 switch (sopt->sopt_name) {
2309 #ifdef notyet
2310 case IP_RETOPTS:
2311 #endif
2312 case IP_OPTIONS: {
2313 struct mbuf *m;
2314
2315 if (sopt->sopt_valsize > MLEN) {
2316 error = EMSGSIZE;
2317 break;
2318 }
2319 MGET(m, sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT,
2320 MT_HEADER);
2321 if (m == NULL) {
2322 error = ENOBUFS;
2323 break;
2324 }
2325 m->m_len = sopt->sopt_valsize;
2326 error = sooptcopyin(sopt, mtod(m, char *),
2327 m->m_len, m->m_len);
2328 if (error)
2329 break;
2330
2331 return (ip_pcbopts(sopt->sopt_name,
2332 &inp->inp_options, m));
2333 }
2334
2335 case IP_TOS:
2336 case IP_TTL:
2337 case IP_RECVOPTS:
2338 case IP_RECVRETOPTS:
2339 case IP_RECVDSTADDR:
2340 case IP_RECVIF:
2341 case IP_RECVTTL:
2342 case IP_RECVPKTINFO:
2343 error = sooptcopyin(sopt, &optval, sizeof (optval),
2344 sizeof (optval));
2345 if (error)
2346 break;
2347
2348 switch (sopt->sopt_name) {
2349 case IP_TOS:
2350 inp->inp_ip_tos = optval;
2351 break;
2352
2353 case IP_TTL:
2354 inp->inp_ip_ttl = optval;
2355 break;
2356 #define OPTSET(bit) \
2357 if (optval) \
2358 inp->inp_flags |= bit; \
2359 else \
2360 inp->inp_flags &= ~bit;
2361
2362 case IP_RECVOPTS:
2363 OPTSET(INP_RECVOPTS);
2364 break;
2365
2366 case IP_RECVRETOPTS:
2367 OPTSET(INP_RECVRETOPTS);
2368 break;
2369
2370 case IP_RECVDSTADDR:
2371 OPTSET(INP_RECVDSTADDR);
2372 break;
2373
2374 case IP_RECVIF:
2375 OPTSET(INP_RECVIF);
2376 break;
2377
2378 case IP_RECVTTL:
2379 OPTSET(INP_RECVTTL);
2380 break;
2381
2382 case IP_RECVPKTINFO:
2383 OPTSET(INP_PKTINFO);
2384 break;
2385 }
2386 break;
2387 #undef OPTSET
2388
2389 #if CONFIG_FORCE_OUT_IFP
2390 /*
2391 * Apple private interface, similar to IP_BOUND_IF, except
2392 * that the parameter is a NULL-terminated string containing
2393 * the name of the network interface; an empty string means
2394 * unbind. Applications are encouraged to use IP_BOUND_IF
2395 * instead, as that is the current "official" API.
2396 */
2397 case IP_FORCE_OUT_IFP: {
2398 char ifname[IFNAMSIZ];
2399 unsigned int ifscope;
2400
2401 /* This option is settable only for IPv4 */
2402 if (!(inp->inp_vflag & INP_IPV4)) {
2403 error = EINVAL;
2404 break;
2405 }
2406
2407 /* Verify interface name parameter is sane */
2408 if (sopt->sopt_valsize > sizeof (ifname)) {
2409 error = EINVAL;
2410 break;
2411 }
2412
2413 /* Copy the interface name */
2414 if (sopt->sopt_valsize != 0) {
2415 error = sooptcopyin(sopt, ifname,
2416 sizeof (ifname), sopt->sopt_valsize);
2417 if (error)
2418 break;
2419 }
2420
2421 if (sopt->sopt_valsize == 0 || ifname[0] == '\0') {
2422 /* Unbind this socket from any interface */
2423 ifscope = IFSCOPE_NONE;
2424 } else {
2425 ifnet_t ifp;
2426
2427 /* Verify name is NULL terminated */
2428 if (ifname[sopt->sopt_valsize - 1] != '\0') {
2429 error = EINVAL;
2430 break;
2431 }
2432
2433 /* Bail out if given bogus interface name */
2434 if (ifnet_find_by_name(ifname, &ifp) != 0) {
2435 error = ENXIO;
2436 break;
2437 }
2438
2439 /* Bind this socket to this interface */
2440 ifscope = ifp->if_index;
2441
2442 /*
2443 * Won't actually free; since we don't release
2444 * this later, we should do it now.
2445 */
2446 ifnet_release(ifp);
2447 }
2448 error = inp_bindif(inp, ifscope, NULL);
2449 }
2450 break;
2451 #endif /* CONFIG_FORCE_OUT_IFP */
2452 /*
2453 * Multicast socket options are processed by the in_mcast
2454 * module.
2455 */
2456 case IP_MULTICAST_IF:
2457 case IP_MULTICAST_IFINDEX:
2458 case IP_MULTICAST_VIF:
2459 case IP_MULTICAST_TTL:
2460 case IP_MULTICAST_LOOP:
2461 case IP_ADD_MEMBERSHIP:
2462 case IP_DROP_MEMBERSHIP:
2463 case IP_ADD_SOURCE_MEMBERSHIP:
2464 case IP_DROP_SOURCE_MEMBERSHIP:
2465 case IP_BLOCK_SOURCE:
2466 case IP_UNBLOCK_SOURCE:
2467 case IP_MSFILTER:
2468 case MCAST_JOIN_GROUP:
2469 case MCAST_LEAVE_GROUP:
2470 case MCAST_JOIN_SOURCE_GROUP:
2471 case MCAST_LEAVE_SOURCE_GROUP:
2472 case MCAST_BLOCK_SOURCE:
2473 case MCAST_UNBLOCK_SOURCE:
2474 error = inp_setmoptions(inp, sopt);
2475 break;
2476
2477 case IP_PORTRANGE:
2478 error = sooptcopyin(sopt, &optval, sizeof (optval),
2479 sizeof (optval));
2480 if (error)
2481 break;
2482
2483 switch (optval) {
2484 case IP_PORTRANGE_DEFAULT:
2485 inp->inp_flags &= ~(INP_LOWPORT);
2486 inp->inp_flags &= ~(INP_HIGHPORT);
2487 break;
2488
2489 case IP_PORTRANGE_HIGH:
2490 inp->inp_flags &= ~(INP_LOWPORT);
2491 inp->inp_flags |= INP_HIGHPORT;
2492 break;
2493
2494 case IP_PORTRANGE_LOW:
2495 inp->inp_flags &= ~(INP_HIGHPORT);
2496 inp->inp_flags |= INP_LOWPORT;
2497 break;
2498
2499 default:
2500 error = EINVAL;
2501 break;
2502 }
2503 break;
2504
2505 #if IPSEC
2506 case IP_IPSEC_POLICY: {
2507 caddr_t req = NULL;
2508 size_t len = 0;
2509 int priv;
2510 struct mbuf *m;
2511 int optname;
2512
2513 if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
2514 break;
2515 if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
2516 break;
2517 priv = (proc_suser(sopt->sopt_p) == 0);
2518 if (m) {
2519 req = mtod(m, caddr_t);
2520 len = m->m_len;
2521 }
2522 optname = sopt->sopt_name;
2523 error = ipsec4_set_policy(inp, optname, req, len, priv);
2524 m_freem(m);
2525 break;
2526 }
2527 #endif /* IPSEC */
2528
2529 #if TRAFFIC_MGT
2530 case IP_TRAFFIC_MGT_BACKGROUND: {
2531 unsigned background = 0;
2532
2533 error = sooptcopyin(sopt, &background,
2534 sizeof (background), sizeof (background));
2535 if (error)
2536 break;
2537
2538 if (background) {
2539 socket_set_traffic_mgt_flags_locked(so,
2540 TRAFFIC_MGT_SO_BACKGROUND);
2541 } else {
2542 socket_clear_traffic_mgt_flags_locked(so,
2543 TRAFFIC_MGT_SO_BACKGROUND);
2544 }
2545
2546 break;
2547 }
2548 #endif /* TRAFFIC_MGT */
2549
2550 /*
2551 * On a multihomed system, scoped routing can be used to
2552 * restrict the source interface used for sending packets.
2553 * The socket option IP_BOUND_IF binds a particular AF_INET
2554 * socket to an interface such that data sent on the socket
2555 * is restricted to that interface. This is unlike the
2556 * SO_DONTROUTE option where the routing table is bypassed;
2557 * therefore it allows for greater flexibility and control
2558 * over the system behavior, and does not place any restriction
2559 * on the destination address type (e.g. unicast, multicast,
2560 * or broadcast if applicable) or whether or not the host is
2561 * directly reachable. Note that in the multicast transmit
2562 * case, IP_MULTICAST_{IF,IFINDEX} takes precedence over
2563 * IP_BOUND_IF, since the former practically bypasses the
2564 * routing table; in this case, IP_BOUND_IF sets the default
2565 * interface used for sending multicast packets in the absence
2566 * of an explicit multicast transmit interface.
2567 */
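/*
 * A sketch of typical userland usage, assuming an interface named
 * "en0" is present:
 *
 *	int idx = if_nametoindex("en0");
 *	(void) setsockopt(s, IPPROTO_IP, IP_BOUND_IF, &idx, sizeof (idx));
 */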
2568 case IP_BOUND_IF:
2569 /* This option is settable only for IPv4 */
2570 if (!(inp->inp_vflag & INP_IPV4)) {
2571 error = EINVAL;
2572 break;
2573 }
2574
2575 error = sooptcopyin(sopt, &optval, sizeof (optval),
2576 sizeof (optval));
2577
2578 if (error)
2579 break;
2580
2581 error = inp_bindif(inp, optval, NULL);
2582 break;
2583
2584 case IP_NO_IFT_CELLULAR:
2585 /* This option is settable only for IPv4 */
2586 if (!(inp->inp_vflag & INP_IPV4)) {
2587 error = EINVAL;
2588 break;
2589 }
2590
2591 error = sooptcopyin(sopt, &optval, sizeof (optval),
2592 sizeof (optval));
2593
2594 if (error)
2595 break;
2596
2597 /* once set, it cannot be unset */
2598 if (!optval && (inp->inp_flags & INP_NO_IFT_CELLULAR)) {
2599 error = EINVAL;
2600 break;
2601 }
2602
2603 error = so_set_restrictions(so,
2604 SO_RESTRICT_DENY_CELLULAR);
2605 break;
2606
2607 case IP_OUT_IF:
2608 /* This option is not settable */
2609 error = EINVAL;
2610 break;
2611
2612 default:
2613 error = ENOPROTOOPT;
2614 break;
2615 }
2616 break;
2617
2618 case SOPT_GET:
2619 switch (sopt->sopt_name) {
2620 case IP_OPTIONS:
2621 case IP_RETOPTS:
2622 if (inp->inp_options) {
2623 error = sooptcopyout(sopt,
2624 mtod(inp->inp_options, char *),
2625 inp->inp_options->m_len);
2626 } else {
2627 sopt->sopt_valsize = 0;
2628 }
2629 break;
2630
2631 case IP_TOS:
2632 case IP_TTL:
2633 case IP_RECVOPTS:
2634 case IP_RECVRETOPTS:
2635 case IP_RECVDSTADDR:
2636 case IP_RECVIF:
2637 case IP_RECVTTL:
2638 case IP_PORTRANGE:
2639 case IP_RECVPKTINFO:
2640 switch (sopt->sopt_name) {
2641
2642 case IP_TOS:
2643 optval = inp->inp_ip_tos;
2644 break;
2645
2646 case IP_TTL:
2647 optval = inp->inp_ip_ttl;
2648 break;
2649
2650 #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0)
2651
2652 case IP_RECVOPTS:
2653 optval = OPTBIT(INP_RECVOPTS);
2654 break;
2655
2656 case IP_RECVRETOPTS:
2657 optval = OPTBIT(INP_RECVRETOPTS);
2658 break;
2659
2660 case IP_RECVDSTADDR:
2661 optval = OPTBIT(INP_RECVDSTADDR);
2662 break;
2663
2664 case IP_RECVIF:
2665 optval = OPTBIT(INP_RECVIF);
2666 break;
2667
2668 case IP_RECVTTL:
2669 optval = OPTBIT(INP_RECVTTL);
2670 break;
2671
2672 case IP_PORTRANGE:
2673 if (inp->inp_flags & INP_HIGHPORT)
2674 optval = IP_PORTRANGE_HIGH;
2675 else if (inp->inp_flags & INP_LOWPORT)
2676 optval = IP_PORTRANGE_LOW;
2677 else
2678 optval = 0;
2679 break;
2680
2681 case IP_RECVPKTINFO:
2682 optval = OPTBIT(INP_PKTINFO);
2683 break;
2684 }
2685 error = sooptcopyout(sopt, &optval, sizeof (optval));
2686 break;
2687
2688 case IP_MULTICAST_IF:
2689 case IP_MULTICAST_IFINDEX:
2690 case IP_MULTICAST_VIF:
2691 case IP_MULTICAST_TTL:
2692 case IP_MULTICAST_LOOP:
2693 case IP_MSFILTER:
2694 error = inp_getmoptions(inp, sopt);
2695 break;
2696
2697 #if IPSEC
2698 case IP_IPSEC_POLICY: {
2699 struct mbuf *m = NULL;
2700 caddr_t req = NULL;
2701 size_t len = 0;
2702
2703 if (m != NULL) {
2704 req = mtod(m, caddr_t);
2705 len = m->m_len;
2706 }
2707 error = ipsec4_get_policy(sotoinpcb(so), req, len, &m);
2708 if (error == 0)
2709 error = soopt_mcopyout(sopt, m); /* XXX */
2710 if (error == 0)
2711 m_freem(m);
2712 break;
2713 }
2714 #endif /* IPSEC */
2715
2716 #if TRAFFIC_MGT
2717 case IP_TRAFFIC_MGT_BACKGROUND: {
2718 unsigned background = (so->so_traffic_mgt_flags &
2719 TRAFFIC_MGT_SO_BACKGROUND) ? 1 : 0;
2720 return (sooptcopyout(sopt, &background,
2721 sizeof (background)));
2722 break;
2723 }
2724 #endif /* TRAFFIC_MGT */
2725
2726 case IP_BOUND_IF:
2727 if (inp->inp_flags & INP_BOUND_IF)
2728 optval = inp->inp_boundifp->if_index;
2729 error = sooptcopyout(sopt, &optval, sizeof (optval));
2730 break;
2731
2732 case IP_NO_IFT_CELLULAR:
2733 optval = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0;
2734 error = sooptcopyout(sopt, &optval, sizeof (optval));
2735 break;
2736
2737 case IP_OUT_IF:
2738 optval = (inp->inp_last_outifp != NULL) ?
2739 inp->inp_last_outifp->if_index : 0;
2740 error = sooptcopyout(sopt, &optval, sizeof (optval));
2741 break;
2742
2743 default:
2744 error = ENOPROTOOPT;
2745 break;
2746 }
2747 break;
2748 }
2749 return (error);
2750 }
2751
2752 /*
2753 * Set up IP options in pcb for insertion in output packets.
2754 * Store in mbuf with pointer in pcbopt, adding pseudo-option
2755 * with destination address if source routed.
2756 */
2757 static int
2758 ip_pcbopts(int optname, struct mbuf **pcbopt, struct mbuf *m)
2759 {
2760 #pragma unused(optname)
2761 int cnt, optlen;
2762 u_char *cp;
2763 u_char opt;
2764
2765 /* turn off any old options */
2766 if (*pcbopt)
2767 (void) m_free(*pcbopt);
2768 *pcbopt = 0;
2769 if (m == (struct mbuf *)0 || m->m_len == 0) {
2770 /*
2771 * Only turning off any previous options.
2772 */
2773 if (m)
2774 (void) m_free(m);
2775 return (0);
2776 }
2777
2778 if (m->m_len % sizeof (int32_t))
2779 goto bad;
2780
2781 /*
2782 * IP first-hop destination address will be stored before
2783 * actual options; move other options back
2784 * and clear it when none present.
2785 */
2786 if (m->m_data + m->m_len + sizeof (struct in_addr) >= &m->m_dat[MLEN])
2787 goto bad;
2788 cnt = m->m_len;
2789 m->m_len += sizeof (struct in_addr);
2790 cp = mtod(m, u_char *) + sizeof (struct in_addr);
2791 ovbcopy(mtod(m, caddr_t), (caddr_t)cp, (unsigned)cnt);
2792 bzero(mtod(m, caddr_t), sizeof (struct in_addr));
2793
2794 for (; cnt > 0; cnt -= optlen, cp += optlen) {
2795 opt = cp[IPOPT_OPTVAL];
2796 if (opt == IPOPT_EOL)
2797 break;
2798 if (opt == IPOPT_NOP)
2799 optlen = 1;
2800 else {
2801 if (cnt < IPOPT_OLEN + sizeof (*cp))
2802 goto bad;
2803 optlen = cp[IPOPT_OLEN];
2804 if (optlen < IPOPT_OLEN + sizeof (*cp) || optlen > cnt)
2805 goto bad;
2806 }
2807 switch (opt) {
2808
2809 default:
2810 break;
2811
2812 case IPOPT_LSRR:
2813 case IPOPT_SSRR:
2814 /*
2815 * user process specifies route as:
2816 * ->A->B->C->D
2817 * D must be our final destination (but we can't
2818 * check that since we may not have connected yet).
2819 * A is first hop destination, which doesn't appear in
2820 * actual IP option, but is stored before the options.
2821 */
2822 if (optlen < IPOPT_MINOFF - 1 + sizeof (struct in_addr))
2823 goto bad;
2824 m->m_len -= sizeof (struct in_addr);
2825 cnt -= sizeof (struct in_addr);
2826 optlen -= sizeof (struct in_addr);
2827 cp[IPOPT_OLEN] = optlen;
2828 /*
2829 * Move first hop before start of options.
2830 */
2831 bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t),
2832 sizeof (struct in_addr));
2833 /*
2834 * Then copy rest of options back
2835 * to close up the deleted entry.
2836 */
2837 ovbcopy((caddr_t)(&cp[IPOPT_OFFSET+1] +
2838 sizeof (struct in_addr)),
2839 (caddr_t)&cp[IPOPT_OFFSET+1],
2840 (unsigned)cnt + sizeof (struct in_addr));
2841 break;
2842 }
2843 }
2844 if (m->m_len > MAX_IPOPTLEN + sizeof (struct in_addr))
2845 goto bad;
2846 *pcbopt = m;
2847 return (0);
2848
2849 bad:
2850 (void) m_free(m);
2851 return (EINVAL);
2852 }
2853
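/*
 * One-time initialization of the ip_moptions zone; the "ifa_debug"
 * boot-arg selects the debug variant of the structure.
 */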
2854 void
2855 ip_moptions_init(void)
2856 {
2857 PE_parse_boot_argn("ifa_debug", &imo_debug, sizeof (imo_debug));
2858
2859 imo_size = (imo_debug == 0) ? sizeof (struct ip_moptions) :
2860 sizeof (struct ip_moptions_dbg);
2861
2862 imo_zone = zinit(imo_size, IMO_ZONE_MAX * imo_size, 0,
2863 IMO_ZONE_NAME);
2864 if (imo_zone == NULL) {
2865 panic("%s: failed allocating %s", __func__, IMO_ZONE_NAME);
2866 /* NOTREACHED */
2867 }
2868 zone_change(imo_zone, Z_EXPAND, TRUE);
2869 }
2870
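/*
 * Take an additional reference on the multicast options structure;
 * the caller indicates whether the IMO lock is already held.
 */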
2871 void
2872 imo_addref(struct ip_moptions *imo, int locked)
2873 {
2874 if (!locked)
2875 IMO_LOCK(imo);
2876 else
2877 IMO_LOCK_ASSERT_HELD(imo);
2878
2879 if (++imo->imo_refcnt == 0) {
2880 panic("%s: imo %p wraparound refcnt\n", __func__, imo);
2881 /* NOTREACHED */
2882 } else if (imo->imo_trace != NULL) {
2883 (*imo->imo_trace)(imo, TRUE);
2884 }
2885
2886 if (!locked)
2887 IMO_UNLOCK(imo);
2888 }
2889
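/*
 * Drop a reference on the multicast options structure; on the final
 * release, leave all joined groups, purge any source filters, and
 * free the structure back to its zone.
 */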
2890 void
2891 imo_remref(struct ip_moptions *imo)
2892 {
2893 int i;
2894
2895 IMO_LOCK(imo);
2896 if (imo->imo_refcnt == 0) {
2897 panic("%s: imo %p negative refcnt", __func__, imo);
2898 /* NOTREACHED */
2899 } else if (imo->imo_trace != NULL) {
2900 (*imo->imo_trace)(imo, FALSE);
2901 }
2902
2903 --imo->imo_refcnt;
2904 if (imo->imo_refcnt > 0) {
2905 IMO_UNLOCK(imo);
2906 return;
2907 }
2908
2909 for (i = 0; i < imo->imo_num_memberships; ++i) {
2910 struct in_mfilter *imf;
2911
2912 imf = imo->imo_mfilters ? &imo->imo_mfilters[i] : NULL;
2913 if (imf != NULL)
2914 imf_leave(imf);
2915
2916 (void) in_leavegroup(imo->imo_membership[i], imf);
2917
2918 if (imf != NULL)
2919 imf_purge(imf);
2920
2921 INM_REMREF(imo->imo_membership[i]);
2922 imo->imo_membership[i] = NULL;
2923 }
2924 imo->imo_num_memberships = 0;
2925 if (imo->imo_mfilters != NULL) {
2926 FREE(imo->imo_mfilters, M_INMFILTER);
2927 imo->imo_mfilters = NULL;
2928 }
2929 if (imo->imo_membership != NULL) {
2930 FREE(imo->imo_membership, M_IPMOPTS);
2931 imo->imo_membership = NULL;
2932 }
2933 IMO_UNLOCK(imo);
2934
2935 lck_mtx_destroy(&imo->imo_lock, ifa_mtx_grp);
2936
2937 if (!(imo->imo_debug & IFD_ALLOC)) {
2938 panic("%s: imo %p cannot be freed", __func__, imo);
2939 /* NOTREACHED */
2940 }
2941 zfree(imo_zone, imo);
2942 }
2943
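/*
 * Record a reference hold or release event in the debug history of
 * an ip_moptions structure allocated with IFD_DEBUG set.
 */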
2944 static void
2945 imo_trace(struct ip_moptions *imo, int refhold)
2946 {
2947 struct ip_moptions_dbg *imo_dbg = (struct ip_moptions_dbg *)imo;
2948 ctrace_t *tr;
2949 u_int32_t idx;
2950 u_int16_t *cnt;
2951
2952 if (!(imo->imo_debug & IFD_DEBUG)) {
2953 panic("%s: imo %p has no debug structure", __func__, imo);
2954 /* NOTREACHED */
2955 }
2956 if (refhold) {
2957 cnt = &imo_dbg->imo_refhold_cnt;
2958 tr = imo_dbg->imo_refhold;
2959 } else {
2960 cnt = &imo_dbg->imo_refrele_cnt;
2961 tr = imo_dbg->imo_refrele;
2962 }
2963
2964 idx = atomic_add_16_ov(cnt, 1) % IMO_TRACE_HIST_SIZE;
2965 ctrace_record(&tr[idx]);
2966 }
2967
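/*
 * Allocate and initialize an ip_moptions structure from its zone and
 * return it with one reference held; the allocation will not block
 * unless "how" is M_WAITOK, so NULL may be returned.
 */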
2968 struct ip_moptions *
2969 ip_allocmoptions(int how)
2970 {
2971 struct ip_moptions *imo;
2972
2973 imo = (how == M_WAITOK) ? zalloc(imo_zone) : zalloc_noblock(imo_zone);
2974 if (imo != NULL) {
2975 bzero(imo, imo_size);
2976 lck_mtx_init(&imo->imo_lock, ifa_mtx_grp, ifa_mtx_attr);
2977 imo->imo_debug |= IFD_ALLOC;
2978 if (imo_debug != 0) {
2979 imo->imo_debug |= IFD_DEBUG;
2980 imo->imo_trace = imo_trace;
2981 }
2982 IMO_ADDREF(imo);
2983 }
2984
2985 return (imo);
2986 }
2987
2988 /*
2989 * Routine called from ip_output() to loop back a copy of an IP multicast
2990 * packet to the input queue of a specified interface. Note that this
2991 * calls the output routine of the loopback "driver", but with an interface
2992 * pointer that might NOT be a loopback interface -- evil, but easier than
2993 * replicating that code here.
2994 */
2995 static void
2996 ip_mloopback(struct ifnet *srcifp, struct ifnet *origifp, struct mbuf *m,
2997 struct sockaddr_in *dst, int hlen)
2998 {
2999 struct mbuf *copym;
3000 struct ip *ip;
3001
3002 if (lo_ifp == NULL)
3003 return;
3004
3005 /*
3006 * Copy the packet header as it's needed for the checksum.
3007 * Make sure to deep-copy IP header portion in case the data
3008 * is in an mbuf cluster, so that we can safely override the IP
3009 * header portion later.
3010 */
3011 copym = m_copym_mode(m, 0, M_COPYALL, M_DONTWAIT, M_COPYM_COPY_HDR);
3012 if (copym != NULL && ((copym->m_flags & M_EXT) || copym->m_len < hlen))
3013 copym = m_pullup(copym, hlen);
3014
3015 if (copym == NULL)
3016 return;
3017
3018 /*
3019 * We don't bother to fragment if the IP length is greater
3020 * than the interface's MTU. Can this possibly matter?
3021 */
3022 ip = mtod(copym, struct ip *);
3023 #if BYTE_ORDER != BIG_ENDIAN
3024 HTONS(ip->ip_len);
3025 HTONS(ip->ip_off);
3026 #endif
3027 ip->ip_sum = 0;
3028 ip->ip_sum = ip_cksum_hdr_out(copym, hlen);
3029
3030 /*
3031 * Mark checksum as valid unless receive checksum offload is
3032 * disabled; if so, compute checksum in software. If the
3033 * interface itself is lo0, this will be overridden by if_loop.
3034 */
3035 if (hwcksum_rx) {
3036 copym->m_pkthdr.csum_flags &= ~CSUM_PARTIAL;
3037 copym->m_pkthdr.csum_flags |=
3038 CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
3039 copym->m_pkthdr.csum_data = 0xffff;
3040 } else if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
3041 #if BYTE_ORDER != BIG_ENDIAN
3042 NTOHS(ip->ip_len);
3043 #endif
3044 in_delayed_cksum(copym);
3045 #if BYTE_ORDER != BIG_ENDIAN
3046 HTONS(ip->ip_len);
3047 #endif
3048 }
3049
3050 /*
3051 * Stuff the 'real' ifp into the pkthdr, to be used in matching
3052 * in ip_input(); we need the loopback ifp/dl_tag passed as args
3053 * to make the loopback driver compliant with the data link
3054 * requirements.
3055 */
3056 copym->m_pkthdr.rcvif = origifp;
3057
3058 /*
3059 * Also record the source interface (which owns the source address).
3060 * This is basically a stripped down version of ifa_foraddr().
3061 */
3062 if (srcifp == NULL) {
3063 struct in_ifaddr *ia;
3064
3065 lck_rw_lock_shared(in_ifaddr_rwlock);
3066 TAILQ_FOREACH(ia, INADDR_HASH(ip->ip_src.s_addr), ia_hash) {
3067 IFA_LOCK_SPIN(&ia->ia_ifa);
3068 if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_src.s_addr) {
3069 srcifp = ia->ia_ifp;
3070 IFA_UNLOCK(&ia->ia_ifa);
3071 break;
3072 }
3073 IFA_UNLOCK(&ia->ia_ifa);
3074 }
3075 lck_rw_done(in_ifaddr_rwlock);
3076 }
3077 if (srcifp != NULL)
3078 ip_setsrcifaddr_info(copym, srcifp->if_index, NULL);
3079 ip_setdstifaddr_info(copym, origifp->if_index, NULL);
3080
3081 dlil_output(lo_ifp, PF_INET, copym, NULL, SA(dst), 0, NULL);
3082 }
3083
3084 /*
3085 * Given a source IP address (and route, if available), determine the best
3086 * interface to send the packet from. Checking for (and updating) the
3087 * ROF_SRCIF_SELECTED flag in the pcb-supplied route placeholder is done
3088 * without any locks based on the assumption that ip_output() is single-
3089 * threaded per-pcb, i.e. for any given pcb there can only be one thread
3090 * performing output at the IP layer.
3091 *
3092 * This routine is analogous to in6_selectroute() for IPv6.
3093 */
3094 static struct ifaddr *
3095 in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope)
3096 {
3097 struct ifaddr *ifa = NULL;
3098 struct in_addr src = ip->ip_src;
3099 struct in_addr dst = ip->ip_dst;
3100 struct ifnet *rt_ifp;
3101 char s_src[MAX_IPv4_STR_LEN], s_dst[MAX_IPv4_STR_LEN];
3102
3103 VERIFY(src.s_addr != INADDR_ANY);
3104
3105 if (ip_select_srcif_debug) {
3106 (void) inet_ntop(AF_INET, &src.s_addr, s_src, sizeof (s_src));
3107 (void) inet_ntop(AF_INET, &dst.s_addr, s_dst, sizeof (s_dst));
3108 }
3109
3110 if (ro->ro_rt != NULL)
3111 RT_LOCK(ro->ro_rt);
3112
3113 rt_ifp = (ro->ro_rt != NULL) ? ro->ro_rt->rt_ifp : NULL;
3114
3115 /*
3116 * Given the source IP address, find a suitable source interface
3117 * to use for transmission; if the caller has specified a scope,
3118 * optimize the search by looking at the addresses only for that
3119 * interface. This is still suboptimal, however, as we need to
3120 * traverse the per-interface list.
3121 */
3122 if (ifscope != IFSCOPE_NONE || ro->ro_rt != NULL) {
3123 unsigned int scope = ifscope;
3124
3125 /*
3126 * If no scope is specified and the route is stale (pointing
3127 * to a defunct interface) use the current primary interface;
3128 * this happens when switching between interfaces configured
3129 * with the same IP address. Otherwise pick up the scope
3130 * information from the route; the ULP may have looked up a
3131 * correct route and we just need to verify it here and mark
3132 * it with the ROF_SRCIF_SELECTED flag below.
3133 */
3134 if (scope == IFSCOPE_NONE) {
3135 scope = rt_ifp->if_index;
3136 if (scope != get_primary_ifscope(AF_INET) &&
3137 ROUTE_UNUSABLE(ro))
3138 scope = get_primary_ifscope(AF_INET);
3139 }
3140
3141 ifa = (struct ifaddr *)ifa_foraddr_scoped(src.s_addr, scope);
3142
3143 if (ifa == NULL && ip->ip_p != IPPROTO_UDP &&
3144 ip->ip_p != IPPROTO_TCP && ipforwarding) {
3145 /*
3146 * If forwarding is enabled, and if the packet isn't
3147 * TCP or UDP, check if the source address belongs
3148 * to one of our own interfaces; if so, demote the
3149 * interface scope and do a route lookup right below.
3150 */
3151 ifa = (struct ifaddr *)ifa_foraddr(src.s_addr);
3152 if (ifa != NULL) {
3153 IFA_REMREF(ifa);
3154 ifa = NULL;
3155 ifscope = IFSCOPE_NONE;
3156 }
3157 }
3158
3159 if (ip_select_srcif_debug && ifa != NULL) {
3160 if (ro->ro_rt != NULL) {
3161 printf("%s->%s ifscope %d->%d ifa_if %s "
3162 "ro_if %s\n", s_src, s_dst, ifscope,
3163 scope, if_name(ifa->ifa_ifp),
3164 if_name(rt_ifp));
3165 } else {
3166 printf("%s->%s ifscope %d->%d ifa_if %s\n",
3167 s_src, s_dst, ifscope, scope,
3168 if_name(ifa->ifa_ifp));
3169 }
3170 }
3171 }
3172
3173 /*
3174 * Slow path; search for an interface having the corresponding source
3175 * IP address if the scope was not specified by the caller, and:
3176 *
3177 * 1) There currently isn't any route, or,
3178 * 2) The interface used by the route does not own that source
3179 * IP address; in this case, the route will get blown away
3180 * and we'll do a more specific scoped search using the newly
3181 * found interface.
3182 */
3183 if (ifa == NULL && ifscope == IFSCOPE_NONE) {
3184 ifa = (struct ifaddr *)ifa_foraddr(src.s_addr);
3185
3186 /*
3187 * If we have the IP address, but not the route, we don't
3188 * really know whether or not it belongs to the correct
3189 * interface (it could be shared across multiple interfaces.)
3190 * The only way to find out is to do a route lookup.
3191 */
3192 if (ifa != NULL && ro->ro_rt == NULL) {
3193 struct rtentry *rt;
3194 struct sockaddr_in sin;
3195 struct ifaddr *oifa = NULL;
3196
3197 bzero(&sin, sizeof (sin));
3198 sin.sin_family = AF_INET;
3199 sin.sin_len = sizeof (sin);
3200 sin.sin_addr = dst;
3201
3202 lck_mtx_lock(rnh_lock);
3203 if ((rt = rt_lookup(TRUE, SA(&sin), NULL,
3204 rt_tables[AF_INET], IFSCOPE_NONE)) != NULL) {
3205 RT_LOCK(rt);
3206 /*
3207 * If the route uses a different interface,
3208 * use that one instead. The IP address of
3209 * the ifaddr that we pick up here is not
3210 * relevant.
3211 */
3212 if (ifa->ifa_ifp != rt->rt_ifp) {
3213 oifa = ifa;
3214 ifa = rt->rt_ifa;
3215 IFA_ADDREF(ifa);
3216 RT_UNLOCK(rt);
3217 } else {
3218 RT_UNLOCK(rt);
3219 }
3220 rtfree_locked(rt);
3221 }
3222 lck_mtx_unlock(rnh_lock);
3223
3224 if (oifa != NULL) {
3225 struct ifaddr *iifa;
3226
3227 /*
3228 * See if the interface pointed to by the
3229 * route is configured with the source IP
3230 * address of the packet.
3231 */
3232 iifa = (struct ifaddr *)ifa_foraddr_scoped(
3233 src.s_addr, ifa->ifa_ifp->if_index);
3234
3235 if (iifa != NULL) {
3236 /*
3237 * Found it; drop the original one
3238 * as well as the route interface
3239 * address, and use this instead.
3240 */
3241 IFA_REMREF(oifa);
3242 IFA_REMREF(ifa);
3243 ifa = iifa;
3244 } else if (!ipforwarding ||
3245 (rt->rt_flags & RTF_GATEWAY)) {
3246 /*
3247 * This interface doesn't have that
3248 * source IP address; drop the route
3249 * interface address and just use the
3250 * original one, and let the caller
3251 * do a scoped route lookup.
3252 */
3253 IFA_REMREF(ifa);
3254 ifa = oifa;
3255 } else {
3256 /*
3257 * Forwarding is enabled and the source
3258 * address belongs to one of our own
3259 * interfaces which isn't the outgoing
3260 * interface, and we have a route, and
3261 * the destination is on a network that
3262 * is directly attached (onlink); drop
3263 * the original one and use the route
3264 * interface address instead.
3265 */
3266 IFA_REMREF(oifa);
3267 }
3268 }
3269 } else if (ifa != NULL && ro->ro_rt != NULL &&
3270 !(ro->ro_rt->rt_flags & RTF_GATEWAY) &&
3271 ifa->ifa_ifp != ro->ro_rt->rt_ifp && ipforwarding) {
3272 /*
3273 * Forwarding is enabled and the source address belongs
3274 * to one of our own interfaces which isn't the same
3275 * as the interface used by the known route; drop the
3276 * original one and use the route interface address.
3277 */
3278 IFA_REMREF(ifa);
3279 ifa = ro->ro_rt->rt_ifa;
3280 IFA_ADDREF(ifa);
3281 }
3282
3283 if (ip_select_srcif_debug && ifa != NULL) {
3284 printf("%s->%s ifscope %d ifa_if %s\n",
3285 s_src, s_dst, ifscope, if_name(ifa->ifa_ifp));
3286 }
3287 }
3288
3289 if (ro->ro_rt != NULL)
3290 RT_LOCK_ASSERT_HELD(ro->ro_rt);
3291 /*
3292 * If there is a non-loopback route with the wrong interface, or if
3293 * there is no interface configured with such an address, blow it
3294 * away. Except for local/loopback, we look for one with a matching
3295 * interface scope/index.
3296 */
3297 if (ro->ro_rt != NULL &&
3298 (ifa == NULL || (ifa->ifa_ifp != rt_ifp && rt_ifp != lo_ifp) ||
3299 !(ro->ro_rt->rt_flags & RTF_UP))) {
3300 if (ip_select_srcif_debug) {
3301 if (ifa != NULL) {
3302 printf("%s->%s ifscope %d ro_if %s != "
3303 "ifa_if %s (cached route cleared)\n",
3304 s_src, s_dst, ifscope, if_name(rt_ifp),
3305 if_name(ifa->ifa_ifp));
3306 } else {
3307 printf("%s->%s ifscope %d ro_if %s "
3308 "(no ifa_if found)\n",
3309 s_src, s_dst, ifscope, if_name(rt_ifp));
3310 }
3311 }
3312
3313 RT_UNLOCK(ro->ro_rt);
3314 ROUTE_RELEASE(ro);
3315
3316 /*
3317 * If the destination is IPv4 LLA and the route's interface
3318 * doesn't match the source interface, then the source IP
3319 * address is wrong; it most likely belongs to the primary
3320 * interface associated with the IPv4 LL subnet. Drop the
3321 * packet rather than letting it go out and return an error
3322 * to the ULP. This actually applies not only to IPv4 LL
3323 * but also to other shared subnets; for now we explicitly test
3324 * only for the former case and save the latter for the future.
3325 */
3326 if (IN_LINKLOCAL(ntohl(dst.s_addr)) &&
3327 !IN_LINKLOCAL(ntohl(src.s_addr)) && ifa != NULL) {
3328 IFA_REMREF(ifa);
3329 ifa = NULL;
3330 }
3331 }
3332
3333 if (ip_select_srcif_debug && ifa == NULL) {
3334 printf("%s->%s ifscope %d (neither ro_if/ifa_if found)\n",
3335 s_src, s_dst, ifscope);
3336 }
3337
3338 /*
3339 * If there is a route, mark it accordingly. If there isn't one,
3340 * we'll get here again during the next transmit (possibly with a
3341 * route) and the flag will get set at that point. For IPv4 LLA
3342 * destination, mark it only if the route has been fully resolved;
3343 * otherwise we want to come back here again when the route points
3344 * to the interface over which the ARP reply arrives.
3345 */
3346 if (ro->ro_rt != NULL && (!IN_LINKLOCAL(ntohl(dst.s_addr)) ||
3347 (ro->ro_rt->rt_gateway->sa_family == AF_LINK &&
3348 SDL(ro->ro_rt->rt_gateway)->sdl_alen != 0))) {
3349 if (ifa != NULL)
3350 IFA_ADDREF(ifa); /* for route */
3351 if (ro->ro_srcia != NULL)
3352 IFA_REMREF(ro->ro_srcia);
3353 ro->ro_srcia = ifa;
3354 ro->ro_flags |= ROF_SRCIF_SELECTED;
3355 RT_GENID_SYNC(ro->ro_rt);
3356 }
3357
3358 if (ro->ro_rt != NULL)
3359 RT_UNLOCK(ro->ro_rt);
3360
3361 return (ifa);
3362 }
3363
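/*
 * Decide how the IP header and payload checksums for this packet are
 * computed: fully in software, fully offloaded, or via partial
 * (16-bit sum) offload, based on the interface's capabilities.  Any
 * checksumming that must be done in software is reported in *sw_csum.
 */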
3364 void
3365 ip_output_checksum(struct ifnet *ifp, struct mbuf *m, int hlen, int ip_len,
3366 uint32_t *sw_csum)
3367 {
3368 int tso = TSO_IPV4_OK(ifp, m);
3369 uint32_t hwcap = ifp->if_hwassist;
3370
3371 m->m_pkthdr.csum_flags |= CSUM_IP;
3372
3373 if (!hwcksum_tx) {
3374 /* do all in software; hardware checksum offload is disabled */
3375 *sw_csum = (CSUM_DELAY_DATA | CSUM_DELAY_IP) &
3376 m->m_pkthdr.csum_flags;
3377 } else {
3378 /* do in software what the hardware cannot */
3379 *sw_csum = m->m_pkthdr.csum_flags &
3380 ~IF_HWASSIST_CSUM_FLAGS(hwcap);
3381 }
3382
3383 if (hlen != sizeof (struct ip)) {
3384 *sw_csum |= ((CSUM_DELAY_DATA | CSUM_DELAY_IP) &
3385 m->m_pkthdr.csum_flags);
3386 } else if (!(*sw_csum & CSUM_DELAY_DATA) && (hwcap & CSUM_PARTIAL)) {
3387 /*
3388 * Partial checksum offload, if non-IP fragment, and TCP only
3389 * (no UDP support, as the hardware may not be able to convert
3390 * +0 to -0 (0xffff) per RFC1122 4.1.3.4.)
3391 */
3392 if (hwcksum_tx && !tso &&
3393 (m->m_pkthdr.csum_flags & CSUM_TCP) &&
3394 ip_len <= ifp->if_mtu) {
3395 uint16_t start = sizeof (struct ip);
3396 uint16_t ulpoff = m->m_pkthdr.csum_data & 0xffff;
3397 m->m_pkthdr.csum_flags |=
3398 (CSUM_DATA_VALID | CSUM_PARTIAL);
3399 m->m_pkthdr.csum_tx_stuff = (ulpoff + start);
3400 m->m_pkthdr.csum_tx_start = start;
3401 /* do IP hdr chksum in software */
3402 *sw_csum = CSUM_DELAY_IP;
3403 } else {
3404 *sw_csum |= (CSUM_DELAY_DATA & m->m_pkthdr.csum_flags);
3405 }
3406 }
3407
3408 if (*sw_csum & CSUM_DELAY_DATA) {
3409 in_delayed_cksum(m);
3410 *sw_csum &= ~CSUM_DELAY_DATA;
3411 }
3412
3413 if (hwcksum_tx) {
3414 /*
3415 * Drop off bits that aren't supported by hardware;
3416 * also make sure to preserve non-checksum related bits.
3417 */
3418 m->m_pkthdr.csum_flags =
3419 ((m->m_pkthdr.csum_flags &
3420 (IF_HWASSIST_CSUM_FLAGS(hwcap) | CSUM_DATA_VALID)) |
3421 (m->m_pkthdr.csum_flags & ~IF_HWASSIST_CSUM_MASK));
3422 } else {
3423 /* drop all bits; hardware checksum offload is disabled */
3424 m->m_pkthdr.csum_flags = 0;
3425 }
3426 }
3427
3428 /*
3429 * GRE protocol output for PPP/PPTP
3430 */
3431 int
3432 ip_gre_output(struct mbuf *m)
3433 {
3434 struct route ro;
3435 int error;
3436
3437 bzero(&ro, sizeof (ro));
3438
3439 error = ip_output(m, NULL, &ro, 0, NULL, NULL);
3440
3441 ROUTE_RELEASE(&ro);
3442
3443 return (error);
3444 }