bsd/netinet/ip_output.c

   1 /*
   2  * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  30  *      The Regents of the University of California.  All rights reserved.
  31  *
  32  * Redistribution and use in source and binary forms, with or without
  33  * modification, are permitted provided that the following conditions
  34  * are met:
  35  * 1. Redistributions of source code must retain the above copyright
  36  *    notice, this list of conditions and the following disclaimer.
  37  * 2. Redistributions in binary form must reproduce the above copyright
  38  *    notice, this list of conditions and the following disclaimer in the
  39  *    documentation and/or other materials provided with the distribution.
  40  * 3. All advertising materials mentioning features or use of this software
  41  *    must display the following acknowledgement:
  42  *      This product includes software developed by the University of
  43  *      California, Berkeley and its contributors.
  44  * 4. Neither the name of the University nor the names of its contributors
  45  *    may be used to endorse or promote products derived from this software
  46  *    without specific prior written permission.
  47  *
  48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  58  * SUCH DAMAGE.
  59  *
  60  *      @(#)ip_output.c 8.3 (Berkeley) 1/21/94
  61  */
  62 /*
  63  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  64  * support for mandatory and extensible security protections.  This notice
  65  * is included in support of clause 2.2 (b) of the Apple Public License,
  66  * Version 2.0.
  67  */
  68
  69 #define _IP_VHL
  70
  71 #include <sys/param.h>
  72 #include <sys/systm.h>
  73 #include <sys/kernel.h>
  74 #include <sys/malloc.h>
  75 #include <sys/mbuf.h>
  76 #include <sys/protosw.h>
  77 #include <sys/socket.h>
  78 #include <sys/socketvar.h>
  79 #include <kern/locks.h>
  80 #include <sys/sysctl.h>
  81 #include <sys/mcache.h>
  82 #include <sys/kdebug.h>
  83
  84 #include <machine/endian.h>
  85 #include <pexpert/pexpert.h>
  86 #include <mach/sdt.h>
  87
  88 #include <libkern/OSAtomic.h>
  89 #include <libkern/OSByteOrder.h>
  90
  91 #include <net/if.h>
  92 #include <net/if_dl.h>
  93 #include <net/if_types.h>
  94 #include <net/route.h>
  95 #include <net/ntstat.h>
  96 #include <net/net_osdep.h>
  97 #include <net/dlil.h>
  98 #include <net/net_perf.h>
  99
 100 #include <netinet/in.h>
 101 #include <netinet/in_systm.h>
 102 #include <netinet/ip.h>
 103 #include <netinet/in_pcb.h>
 104 #include <netinet/in_var.h>
 105 #include <netinet/ip_var.h>
 106 #include <netinet/kpi_ipfilter_var.h>
 107 #include <netinet/in_tclass.h>
 108 #include <netinet/udp.h>
 109
 110 #include <netinet6/nd6.h>
 111
 112 #if CONFIG_MACF_NET
 113 #include <security/mac_framework.h>
 114 #endif /* CONFIG_MACF_NET */
 115
 116 #define DBG_LAYER_BEG           NETDBG_CODE(DBG_NETIP, 1)
 117 #define DBG_LAYER_END           NETDBG_CODE(DBG_NETIP, 3)
 118 #define DBG_FNC_IP_OUTPUT       NETDBG_CODE(DBG_NETIP, (1 << 8) | 1)
 119 #define DBG_FNC_IPSEC4_OUTPUT   NETDBG_CODE(DBG_NETIP, (2 << 8) | 1)
 120
 121 #if IPSEC
 122 #include <netinet6/ipsec.h>
 123 #include <netkey/key.h>
 124 #if IPSEC_DEBUG
 125 #include <netkey/key_debug.h>
 126 #else
 127 #define KEYDEBUG(lev, arg)
 128 #endif
 129 #endif /* IPSEC */
 130
 131 #if NECP
 132 #include <net/necp.h>
 133 #endif /* NECP */
 134
 135 #if IPFIREWALL
 136 #include <netinet/ip_fw.h>
 137 #if IPDIVERT
 138 #include <netinet/ip_divert.h>
 139 #endif /* IPDIVERT */
 140 #endif /* IPFIREWALL */
 141
 142 #if DUMMYNET
 143 #include <netinet/ip_dummynet.h>
 144 #endif
 145
 146 #if PF
 147 #include <net/pfvar.h>
 148 #endif /* PF */
 149
 150 #if IPFIREWALL_FORWARD && IPFIREWALL_FORWARD_DEBUG
 151 #define print_ip(a)     \
 152         printf("%ld.%ld.%ld.%ld", (ntohl(a.s_addr) >> 24) & 0xFF,       \
 153             (ntohl(a.s_addr) >> 16) & 0xFF,                             \
 154             (ntohl(a.s_addr) >> 8) & 0xFF,                              \
 155             (ntohl(a.s_addr)) & 0xFF);
 156 #endif /* IPFIREWALL_FORWARD && IPFIREWALL_FORWARD_DEBUG */
  157 /* NOTE(review): ip_id is not referenced in the visible part of this file; presumably a legacy IP identification value - confirm. */
  158 u_short ip_id;
  159 /* Prototypes for the sysctl handlers and the other static helpers of this file. */
  160 static int sysctl_reset_ip_output_stats SYSCTL_HANDLER_ARGS;
  161 static int sysctl_ip_output_measure_bins SYSCTL_HANDLER_ARGS;
  162 static int sysctl_ip_output_getperf SYSCTL_HANDLER_ARGS;
  163 static void ip_out_cksum_stats(int, u_int32_t);
  164 static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);
  165 static int ip_optcopy(struct ip *, struct ip *);
  166 static int ip_pcbopts(int, struct mbuf **, struct mbuf *);
  167 static void imo_trace(struct ip_moptions *, int);
  168 static void ip_mloopback(struct ifnet *, struct ifnet *, struct mbuf *,
  169     struct sockaddr_in *, int);
  170 static struct ifaddr *in_selectsrcif(struct ip *, struct route *, unsigned int);
  171 /* External objects used by this file; their definitions are not in the visible source. */
  172 extern struct ip_linklocal_stat ip_linklocal_stat;
  173
  174 /* temporary: for testing; ip_output_list() only consults IPsec while this is 0 */
  175 #if IPSEC
  176 extern int ipsec_bypass;
  177 #endif /* IPSEC */
  178 /* Read-write integer sysctl knobs registered under _net_inet_ip. */
  179 static int ip_maxchainsent = 0;
  180 SYSCTL_INT(_net_inet_ip, OID_AUTO, maxchainsent,
  181     CTLFLAG_RW | CTLFLAG_LOCKED, &ip_maxchainsent, 0,
  182     "use dlil_output_list");
  183 #if DEBUG
  184 static int forge_ce = 0; /* count of ECT(0)/ECT(1) packets ip_output_list() will still re-mark CE; decremented per forged packet */
  185 SYSCTL_INT(_net_inet_ip, OID_AUTO, forge_ce,
  186     CTLFLAG_RW | CTLFLAG_LOCKED, &forge_ce, 0,
  187     "Forge ECN CE");
  188 #endif /* DEBUG */
  189 /* Debug switch for source interface selection logging (per the sysctl description). */
  190 static int ip_select_srcif_debug = 0;
  191 SYSCTL_INT(_net_inet_ip, OID_AUTO, select_srcif_debug,
  192     CTLFLAG_RW | CTLFLAG_LOCKED, &ip_select_srcif_debug, 0,
  193     "log source interface selection debug info");
  194 /* When nonzero, ip_output_list() records its start time via net_perf_start_time(). */
  195 static int ip_output_measure = 0;
  196 SYSCTL_PROC(_net_inet_ip, OID_AUTO, output_perf,
  197     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
  198     &ip_output_measure, 0, sysctl_reset_ip_output_stats, "I",
  199     "Do time measurement");
  200 /* NOTE(review): CTLTYPE_QUAD below is registered with format "I"; "Q" may be intended - confirm. */
  201 static uint64_t ip_output_measure_bins = 0;
  202 SYSCTL_PROC(_net_inet_ip, OID_AUTO, output_perf_bins,
  203     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &ip_output_measure_bins, 0,
  204     sysctl_ip_output_measure_bins, "I",
  205     "bins for chaining performance data histogram");
  206 /* Measurement state handed to net_perf_start_time(); the output_perf_data node is read-only. */
  207 static net_perf_t net_perf;
  208 SYSCTL_PROC(_net_inet_ip, OID_AUTO, output_perf_data,
  209     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
  210     0, 0, sysctl_ip_output_getperf, "S,net_perf",
  211     "IP output performance data (struct net_perf, net/net_perf.h)");
  212 /* When set, packets that are neither forwarded nor raw output get ip_id 0 if atomic, else ip_randomid(). */
  213 __private_extern__ int rfc6864 = 1;
  214 SYSCTL_INT(_net_inet_ip, OID_AUTO, rfc6864, CTLFLAG_RW | CTLFLAG_LOCKED,
  215     &rfc6864, 0, "updated ip id field behavior");
 216
  217 #define IMO_TRACE_HIST_SIZE     32      /* size of trace history */
  218
  219 /* For gdb: run-time copy of IMO_TRACE_HIST_SIZE */
  220 __private_extern__ unsigned int imo_trace_hist_size = IMO_TRACE_HIST_SIZE;
  221 /* An ip_moptions followed by reference-count and caller-trace bookkeeping for debugging. */
  222 struct ip_moptions_dbg {
  223         struct ip_moptions      imo;                    /* embedded ip_moptions (first member) */
  224         u_int16_t               imo_refhold_cnt;        /* # of IMO_ADDREF */
  225         u_int16_t               imo_refrele_cnt;        /* # of IMO_REMREF */
  226         /*
  227          * Alloc and free callers.
  228          */
  229         ctrace_t                imo_alloc;
  230         ctrace_t                imo_free;
  231         /*
  232          * Circular lists of IMO_ADDREF and IMO_REMREF callers,
  233          * each holding IMO_TRACE_HIST_SIZE entries.
  234          */
  235         ctrace_t                imo_refhold[IMO_TRACE_HIST_SIZE];
  236         ctrace_t                imo_refrele[IMO_TRACE_HIST_SIZE];
  237 };
  237 /* File-local ip_moptions zone state; imo_debug starts at 1 only in DEBUG builds. */
  238 #if DEBUG
  239 static unsigned int imo_debug = 1;      /* debugging (enabled) */
  240 #else
  241 static unsigned int imo_debug;          /* debugging (disabled) */
  242 #endif /* !DEBUG */
  243 static unsigned int imo_size;           /* size of zone element */
  244 static struct zone *imo_zone;           /* zone for ip_moptions */
  245 /* Parameters of the ip_moptions zone. */
  246 #define IMO_ZONE_MAX            64              /* maximum elements in zone */
  247 #define IMO_ZONE_NAME           "ip_moptions"   /* zone name */
 248
  249 /*
  250  * IP output for a single packet: hands m0 to ip_output_list() with a
  251  * packetchain argument of 0.  The packet in mbuf chain m0 contains a
  252  * skeletal IP header (with len, off, ttl, proto, tos, src, dst); that
  253  * chain will be freed.  The mbuf opt, if present, will not be freed.
  254  */
  255 int
  256 ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro, int flags,
  257     struct ip_moptions *imo, struct ip_out_args *ipoa)
  258 {
  259         return ip_output_list(m0, 0, opt, ro, flags, imo, ipoa); /* result is passed through unchanged */
  260 }
 261
 262 /*
 263  * IP output.  The packet in mbuf chain m contains a skeletal IP
 264  * header (with len, off, ttl, proto, tos, src, dst).
 265  * The mbuf chain containing the packet will be freed.
 266  * The mbuf opt, if present, will not be freed.
 267  *
 268  * Route ro MUST be non-NULL; if ro->ro_rt is valid, route lookup would be
 269  * skipped and ro->ro_rt would be used.  Otherwise the result of route
 270  * lookup is stored in ro->ro_rt.
 271  *
 272  * In the IP forwarding case, the packet will arrive with options already
 273  * inserted, so must have a NULL opt pointer.
 274  */
 275 int
 276 ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt,
 277     struct route *ro, int flags, struct ip_moptions *imo,
 278     struct ip_out_args *ipoa)
 279 {
 280         struct ip *ip;
 281         struct ifnet *ifp = NULL;               /* not refcnt'd */
 282         struct mbuf *m = m0, *prevnxt = NULL, **mppn = &prevnxt;
 283         int hlen = sizeof(struct ip);
 284         int len = 0, error = 0;
 285         struct sockaddr_in *dst = NULL;
 286         struct in_ifaddr *ia = NULL, *src_ia = NULL;
 287         struct in_addr pkt_dst;
 288         struct ipf_pktopts *ippo = NULL;
 289         ipfilter_t inject_filter_ref = NULL;
 290         struct mbuf *packetlist;
 291         uint32_t sw_csum, pktcnt = 0, scnt = 0, bytecnt = 0;
 292         uint32_t packets_processed = 0;
 293         unsigned int ifscope = IFSCOPE_NONE;
 294         struct flowadv *adv = NULL;
 295         struct timeval start_tv;
 296 #if IPSEC
 297         struct socket *so = NULL;
 298         struct secpolicy *sp = NULL;
 299 #endif /* IPSEC */
 300 #if NECP
 301         necp_kernel_policy_result necp_result = 0;
 302         necp_kernel_policy_result_parameter necp_result_parameter;
 303         necp_kernel_policy_id necp_matched_policy_id = 0;
 304 #endif /* NECP */
 305 #if IPFIREWALL
 306         int ipfwoff;
 307         struct sockaddr_in *next_hop_from_ipfwd_tag = NULL;
 308 #endif /* IPFIREWALL */
 309 #if IPFIREWALL || DUMMYNET
 310         struct m_tag *tag;
 311 #endif /* IPFIREWALL || DUMMYNET */
 312 #if DUMMYNET
 313         struct ip_out_args saved_ipoa;
 314         struct sockaddr_in dst_buf;
 315 #endif /* DUMMYNET */
 316         struct {
 317 #if IPSEC
 318                 struct ipsec_output_state ipsec_state;
 319 #endif /* IPSEC */
 320 #if NECP
 321                 struct route necp_route;
 322 #endif /* NECP */
 323 #if IPFIREWALL || DUMMYNET
 324                 struct ip_fw_args args;
 325 #endif /* IPFIREWALL || DUMMYNET */
 326 #if IPFIREWALL_FORWARD
 327                 struct route sro_fwd;
 328 #endif /* IPFIREWALL_FORWARD */
 329 #if DUMMYNET
 330                 struct route saved_route;
 331 #endif /* DUMMYNET */
 332                 struct ipf_pktopts ipf_pktopts;
 333         } ipobz;
 334 #define ipsec_state     ipobz.ipsec_state
 335 #define necp_route      ipobz.necp_route
 336 #define args            ipobz.args
 337 #define sro_fwd         ipobz.sro_fwd
 338 #define saved_route     ipobz.saved_route
 339 #define ipf_pktopts     ipobz.ipf_pktopts
 340         union {
 341                 struct {
 342                         boolean_t select_srcif : 1;     /* set once */
 343                         boolean_t srcbound : 1;         /* set once */
 344                         boolean_t nocell : 1;           /* set once */
 345                         boolean_t isbroadcast : 1;
 346                         boolean_t didfilter : 1;
 347                         boolean_t noexpensive : 1;      /* set once */
 348                         boolean_t noconstrained : 1;      /* set once */
 349                         boolean_t awdl_unrestricted : 1;        /* set once */
 350 #if IPFIREWALL_FORWARD
 351                         boolean_t fwd_rewrite_src : 1;
 352 #endif /* IPFIREWALL_FORWARD */
 353                 };
 354                 uint32_t raw;
 355         } ipobf = { .raw = 0 };
 356
 357         int interface_mtu = 0;
 358
 359 /*
 360  * Here we check for restrictions when sending frames.
 361  * N.B.: IPv4 over internal co-processor interfaces is not allowed.
 362  */
 363 #define IP_CHECK_RESTRICTIONS(_ifp, _ipobf)                             \
 364         (((_ipobf).nocell && IFNET_IS_CELLULAR(_ifp)) ||                \
 365          ((_ipobf).noexpensive && IFNET_IS_EXPENSIVE(_ifp)) ||          \
 366          ((_ipobf).noconstrained && IFNET_IS_CONSTRAINED(_ifp)) ||      \
 367           (IFNET_IS_INTCOPROC(_ifp)) ||                                 \
 368          (!(_ipobf).awdl_unrestricted && IFNET_IS_AWDL_RESTRICTED(_ifp)))
 369
 370         if (ip_output_measure) {
 371                 net_perf_start_time(&net_perf, &start_tv);
 372         }
 373         KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
 374
 375         VERIFY(m0->m_flags & M_PKTHDR);
 376         packetlist = m0;
 377
  378         /* zero out {ipsec_state, necp_route, args, sro_fwd, saved_route, ipf_pktopts} */
 379         bzero(&ipobz, sizeof(ipobz));
 380         ippo = &ipf_pktopts;
 381
 382 #if IPFIREWALL || DUMMYNET
 383         if (SLIST_EMPTY(&m0->m_pkthdr.tags)) {
 384                 goto ipfw_tags_done;
 385         }
 386
 387         /* Grab info from mtags prepended to the chain */
 388 #if DUMMYNET
 389         if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID,
 390             KERNEL_TAG_TYPE_DUMMYNET, NULL)) != NULL) {
 391                 struct dn_pkt_tag       *dn_tag;
 392
 393                 dn_tag = (struct dn_pkt_tag *)(tag + 1);
 394                 args.fwa_ipfw_rule = dn_tag->dn_ipfw_rule;
 395                 args.fwa_pf_rule = dn_tag->dn_pf_rule;
 396                 opt = NULL;
 397                 saved_route = dn_tag->dn_ro;
 398                 ro = &saved_route;
 399
 400                 imo = NULL;
 401                 bcopy(&dn_tag->dn_dst, &dst_buf, sizeof(dst_buf));
 402                 dst = &dst_buf;
 403                 ifp = dn_tag->dn_ifp;
 404                 flags = dn_tag->dn_flags;
 405                 if ((dn_tag->dn_flags & IP_OUTARGS)) {
 406                         saved_ipoa = dn_tag->dn_ipoa;
 407                         ipoa = &saved_ipoa;
 408                 }
 409
 410                 m_tag_delete(m0, tag);
 411         }
 412 #endif /* DUMMYNET */
 413
 414 #if IPDIVERT
 415         if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID,
 416             KERNEL_TAG_TYPE_DIVERT, NULL)) != NULL) {
 417                 struct divert_tag       *div_tag;
 418
 419                 div_tag = (struct divert_tag *)(tag + 1);
 420                 args.fwa_divert_rule = div_tag->cookie;
 421
 422                 m_tag_delete(m0, tag);
 423         }
 424 #endif /* IPDIVERT */
 425
 426 #if IPFIREWALL
 427         if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID,
 428             KERNEL_TAG_TYPE_IPFORWARD, NULL)) != NULL) {
 429                 struct ip_fwd_tag       *ipfwd_tag;
 430
 431                 ipfwd_tag = (struct ip_fwd_tag *)(tag + 1);
 432                 next_hop_from_ipfwd_tag = ipfwd_tag->next_hop;
 433
 434                 m_tag_delete(m0, tag);
 435         }
 436 #endif /* IPFIREWALL */
 437
 438 ipfw_tags_done:
 439 #endif /* IPFIREWALL || DUMMYNET */
 440
 441         m = m0;
 442         m->m_pkthdr.pkt_flags &= ~(PKTF_LOOP | PKTF_IFAINFO);
 443
 444 #if IPSEC
 445         if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) {
 446                 /* If packet is bound to an interface, check bound policies */
 447                 if ((flags & IP_OUTARGS) && (ipoa != NULL) &&
 448                     (ipoa->ipoa_flags & IPOAF_BOUND_IF) &&
 449                     ipoa->ipoa_boundif != IFSCOPE_NONE) {
 450                         if (ipsec4_getpolicybyinterface(m, IPSEC_DIR_OUTBOUND,
 451                             &flags, ipoa, &sp) != 0) {
 452                                 goto bad;
 453                         }
 454                 }
 455         }
 456 #endif /* IPSEC */
 457
 458         VERIFY(ro != NULL);
 459
 460         if (flags & IP_OUTARGS) {
 461                 /*
 462                  * In the forwarding case, only the ifscope value is used,
 463                  * as source interface selection doesn't take place.
 464                  */
 465                 if ((ipobf.select_srcif = (!(flags & IP_FORWARDING) &&
 466                     (ipoa->ipoa_flags & IPOAF_SELECT_SRCIF)))) {
 467                         ipf_pktopts.ippo_flags |= IPPOF_SELECT_SRCIF;
 468                 }
 469
 470                 if ((ipoa->ipoa_flags & IPOAF_BOUND_IF) &&
 471                     ipoa->ipoa_boundif != IFSCOPE_NONE) {
 472                         ifscope = ipoa->ipoa_boundif;
 473                         ipf_pktopts.ippo_flags |=
 474                             (IPPOF_BOUND_IF | (ifscope << IPPOF_SHIFT_IFSCOPE));
 475                 }
 476
 477                 /* double negation needed for bool bit field */
 478                 ipobf.srcbound = !!(ipoa->ipoa_flags & IPOAF_BOUND_SRCADDR);
 479                 if (ipobf.srcbound) {
 480                         ipf_pktopts.ippo_flags |= IPPOF_BOUND_SRCADDR;
 481                 }
 482         } else {
 483                 ipobf.select_srcif = FALSE;
 484                 ipobf.srcbound = FALSE;
 485                 ifscope = IFSCOPE_NONE;
 486                 if (flags & IP_OUTARGS) {
 487                         ipoa->ipoa_boundif = IFSCOPE_NONE;
 488                         ipoa->ipoa_flags &= ~(IPOAF_SELECT_SRCIF |
 489                             IPOAF_BOUND_IF | IPOAF_BOUND_SRCADDR);
 490                 }
 491         }
 492
 493         if (flags & IP_OUTARGS) {
 494                 if (ipoa->ipoa_flags & IPOAF_NO_CELLULAR) {
 495                         ipobf.nocell = TRUE;
 496                         ipf_pktopts.ippo_flags |= IPPOF_NO_IFT_CELLULAR;
 497                 }
 498                 if (ipoa->ipoa_flags & IPOAF_NO_EXPENSIVE) {
 499                         ipobf.noexpensive = TRUE;
 500                         ipf_pktopts.ippo_flags |= IPPOF_NO_IFF_EXPENSIVE;
 501                 }
 502                 if (ipoa->ipoa_flags & IPOAF_NO_CONSTRAINED) {
 503                         ipobf.noconstrained = TRUE;
 504                         ipf_pktopts.ippo_flags |= IPPOF_NO_IFF_CONSTRAINED;
 505                 }
 506                 if (ipoa->ipoa_flags & IPOAF_AWDL_UNRESTRICTED) {
 507                         ipobf.awdl_unrestricted = TRUE;
 508                 }
 509                 adv = &ipoa->ipoa_flowadv;
 510                 adv->code = FADV_SUCCESS;
 511                 ipoa->ipoa_retflags = 0;
 512         }
 513
 514 #if IPSEC
 515         if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) {
 516                 so = ipsec_getsocket(m);
 517                 if (so != NULL) {
 518                         (void) ipsec_setsocket(m, NULL);
 519                 }
 520         }
 521 #endif /* IPSEC */
 522
 523 #if DUMMYNET
 524         if (args.fwa_ipfw_rule != NULL || args.fwa_pf_rule != NULL) {
 525                 /* dummynet already saw us */
 526                 ip = mtod(m, struct ip *);
 527                 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
 528                 pkt_dst = ip->ip_dst;
 529                 if (ro->ro_rt != NULL) {
 530                         RT_LOCK_SPIN(ro->ro_rt);
 531                         ia = (struct in_ifaddr *)ro->ro_rt->rt_ifa;
 532                         if (ia) {
 533                                 /* Become a regular mutex */
 534                                 RT_CONVERT_LOCK(ro->ro_rt);
 535                                 IFA_ADDREF(&ia->ia_ifa);
 536                         }
 537                         RT_UNLOCK(ro->ro_rt);
 538                 }
 539
 540 #if IPFIREWALL
 541                 if (args.fwa_ipfw_rule != NULL) {
 542                         goto skip_ipsec;
 543                 }
 544 #endif /* IPFIREWALL  */
 545                 if (args.fwa_pf_rule != NULL) {
 546                         goto sendit;
 547                 }
 548         }
 549 #endif /* DUMMYNET */
 550
 551 loopit:
 552         packets_processed++;
 553         ipobf.isbroadcast = FALSE;
 554         ipobf.didfilter = FALSE;
 555 #if IPFIREWALL_FORWARD
 556         ipobf.fwd_rewrite_src = FALSE;
 557 #endif /* IPFIREWALL_FORWARD */
 558
 559         VERIFY(m->m_flags & M_PKTHDR);
 560         /*
  561          * No need to process the packet twice if we've already seen it.
 562          */
 563         if (!SLIST_EMPTY(&m->m_pkthdr.tags)) {
 564                 inject_filter_ref = ipf_get_inject_filter(m);
 565         } else {
 566                 inject_filter_ref = NULL;
 567         }
 568
 569         if (opt) {
 570                 m = ip_insertoptions(m, opt, &len);
 571                 hlen = len;
 572                 /* Update the chain */
 573                 if (m != m0) {
 574                         if (m0 == packetlist) {
 575                                 packetlist = m;
 576                         }
 577                         m0 = m;
 578                 }
 579         }
 580         ip = mtod(m, struct ip *);
 581
 582 #if IPFIREWALL
 583         /*
 584          * rdar://8542331
 585          *
 586          * When dealing with a packet chain, we need to reset "next_hop"
 587          * because "dst" may have been changed to the gateway address below
 588          * for the previous packet of the chain. This could cause the route
  589          * to be inadvertently changed to the route to the gateway address
 590          * (instead of the route to the destination).
 591          */
 592         args.fwa_next_hop = next_hop_from_ipfwd_tag;
 593         pkt_dst = args.fwa_next_hop ? args.fwa_next_hop->sin_addr : ip->ip_dst;
 594 #else /* !IPFIREWALL */
 595         pkt_dst = ip->ip_dst;
 596 #endif /* !IPFIREWALL */
 597
 598         /*
 599          * We must not send if the packet is destined to network zero.
 600          * RFC1122 3.2.1.3 (a) and (b).
 601          */
 602         if (IN_ZERONET(ntohl(pkt_dst.s_addr))) {
 603                 error = EHOSTUNREACH;
 604                 goto bad;
 605         }
 606
 607         /*
 608          * Fill in IP header.
 609          */
 610         if (!(flags & (IP_FORWARDING | IP_RAWOUTPUT))) {
 611                 ip->ip_vhl = IP_MAKE_VHL(IPVERSION, hlen >> 2);
 612                 ip->ip_off &= IP_DF;
 613                 if (rfc6864 && IP_OFF_IS_ATOMIC(ip->ip_off)) {
 614                         // Per RFC6864, value of ip_id is undefined for atomic ip packets
 615                         ip->ip_id = 0;
 616                 } else {
 617                         ip->ip_id = ip_randomid();
 618                 }
 619                 OSAddAtomic(1, &ipstat.ips_localout);
 620         } else {
 621                 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
 622         }
 623
 624 #if DEBUG
 625         /* For debugging, we let the stack forge congestion */
 626         if (forge_ce != 0 &&
 627             ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT1 ||
 628             (ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT0)) {
 629                 ip->ip_tos = (ip->ip_tos & ~IPTOS_ECN_MASK) | IPTOS_ECN_CE;
 630                 forge_ce--;
 631         }
 632 #endif /* DEBUG */
 633
 634         KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr, ip->ip_src.s_addr,
 635             ip->ip_p, ip->ip_off, ip->ip_len);
 636
 637         dst = SIN(&ro->ro_dst);
 638
 639         /*
 640          * If there is a cached route,
 641          * check that it is to the same destination
 642          * and is still up.  If not, free it and try again.
 643          * The address family should also be checked in case of sharing the
 644          * cache with IPv6.
 645          */
 646
 647         if (ro->ro_rt != NULL) {
 648                 if (ROUTE_UNUSABLE(ro) && ip->ip_src.s_addr != INADDR_ANY &&
 649                     !(flags & (IP_ROUTETOIF | IP_FORWARDING))) {
 650                         src_ia = ifa_foraddr(ip->ip_src.s_addr);
 651                         if (src_ia == NULL) {
 652                                 error = EADDRNOTAVAIL;
 653                                 goto bad;
 654                         }
 655                         IFA_REMREF(&src_ia->ia_ifa);
 656                         src_ia = NULL;
 657                 }
 658                 /*
 659                  * Test rt_flags without holding rt_lock for performance
 660                  * reasons; if the route is down it will hopefully be
 661                  * caught by the layer below (since it uses this route
 662                  * as a hint) or during the next transmit.
 663                  */
 664                 if (ROUTE_UNUSABLE(ro) || dst->sin_family != AF_INET ||
 665                     dst->sin_addr.s_addr != pkt_dst.s_addr) {
 666                         ROUTE_RELEASE(ro);
 667                 }
 668
 669                 /*
 670                  * If we're doing source interface selection, we may not
 671                  * want to use this route; only synch up the generation
 672                  * count otherwise.
 673                  */
 674                 if (!ipobf.select_srcif && ro->ro_rt != NULL &&
 675                     RT_GENID_OUTOFSYNC(ro->ro_rt)) {
 676                         RT_GENID_SYNC(ro->ro_rt);
 677                 }
 678         }
 679         if (ro->ro_rt == NULL) {
 680                 bzero(dst, sizeof(*dst));
 681                 dst->sin_family = AF_INET;
 682                 dst->sin_len = sizeof(*dst);
 683                 dst->sin_addr = pkt_dst;
 684         }
 685         /*
 686          * If routing to interface only,
 687          * short circuit routing lookup.
 688          */
 689         if (flags & IP_ROUTETOIF) {
 690                 if (ia != NULL) {
 691                         IFA_REMREF(&ia->ia_ifa);
 692                 }
 693                 if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) {
 694                         ia = ifatoia(ifa_ifwithnet(sintosa(dst)));
 695                         if (ia == NULL) {
 696                                 OSAddAtomic(1, &ipstat.ips_noroute);
 697                                 error = ENETUNREACH;
 698                                 /* XXX IPv6 APN fallback notification?? */
 699                                 goto bad;
 700                         }
 701                 }
 702                 ifp = ia->ia_ifp;
 703                 ip->ip_ttl = 1;
 704                 ipobf.isbroadcast = in_broadcast(dst->sin_addr, ifp);
 705                 /*
 706                  * For consistency with other cases below.  Loopback
 707                  * multicast case is handled separately by ip_mloopback().
 708                  */
 709                 if ((ifp->if_flags & IFF_LOOPBACK) &&
 710                     !IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
 711                         m->m_pkthdr.rcvif = ifp;
 712                         ip_setsrcifaddr_info(m, ifp->if_index, NULL);
 713                         ip_setdstifaddr_info(m, ifp->if_index, NULL);
 714                 }
 715         } else if (IN_MULTICAST(ntohl(pkt_dst.s_addr)) &&
 716             imo != NULL && (ifp = imo->imo_multicast_ifp) != NULL) {
 717                 /*
 718                  * Bypass the normal routing lookup for multicast
 719                  * packets if the interface is specified.
 720                  */
 721                 ipobf.isbroadcast = FALSE;
 722                 if (ia != NULL) {
 723                         IFA_REMREF(&ia->ia_ifa);
 724                 }
 725
 726                 /* Macro takes reference on ia */
 727                 IFP_TO_IA(ifp, ia);
 728         } else {
 729                 struct ifaddr *ia0 = NULL;
 730                 boolean_t cloneok = FALSE;
 731                 /*
 732                  * Perform source interface selection; the source IP address
 733                  * must belong to one of the addresses of the interface used
 734                  * by the route.  For performance reasons, do this only if
 735                  * there is no route, or if the routing table has changed,
 736                  * or if we haven't done source interface selection on this
 737                  * route (for this PCB instance) before.
 738                  */
 739                 if (ipobf.select_srcif &&
 740                     ip->ip_src.s_addr != INADDR_ANY && (ROUTE_UNUSABLE(ro) ||
 741                     !(ro->ro_flags & ROF_SRCIF_SELECTED))) {
 742                         /* Find the source interface */
 743                         ia0 = in_selectsrcif(ip, ro, ifscope);
 744
 745                         /*
 746                          * If the source address belongs to a restricted
 747                          * interface and the caller forbids our using
 748                          * interfaces of such type, pretend that there is no
 749                          * route.
 750                          */
 751                         if (ia0 != NULL &&
 752                             IP_CHECK_RESTRICTIONS(ia0->ifa_ifp, ipobf)) {
 753                                 IFA_REMREF(ia0);
 754                                 ia0 = NULL;
 755                                 error = EHOSTUNREACH;
 756                                 if (flags & IP_OUTARGS) {
 757                                         ipoa->ipoa_retflags |= IPOARF_IFDENIED;
 758                                 }
 759                                 goto bad;
 760                         }
 761
 762                         /*
 763                          * If the source address is spoofed (in the case of
 764                          * IP_RAWOUTPUT on an unbound socket), or if this
 765                          * is destined for local/loopback, just let it go out
 766                          * using the interface of the route.  Otherwise,
 767                          * there's no interface having such an address,
 768                          * so bail out.
 769                          */
 770                         if (ia0 == NULL && (!(flags & IP_RAWOUTPUT) ||
 771                             ipobf.srcbound) && ifscope != lo_ifp->if_index) {
 772                                 error = EADDRNOTAVAIL;
 773                                 goto bad;
 774                         }
 775
 776                         /*
 777                          * If the caller didn't explicitly specify the scope,
 778                          * pick it up from the source interface.  If the cached
 779                          * route was wrong and was blown away as part of source
 780                          * interface selection, don't mask out RTF_PRCLONING
 781                          * since that route may have been allocated by the ULP,
 782                          * unless the IP header was created by the caller or
 783                          * the destination is IPv4 LLA.  The check for the
 784                          * latter is needed because IPv4 LLAs are never scoped
 785                          * in the current implementation, and we don't want to
 786                          * replace the resolved IPv4 LLA route with one whose
 787                          * gateway points to that of the default gateway on
 788                          * the primary interface of the system.
 789                          */
 790                         if (ia0 != NULL) {
 791                                 if (ifscope == IFSCOPE_NONE) {
 792                                         ifscope = ia0->ifa_ifp->if_index;
 793                                 }
 794                                 cloneok = (!(flags & IP_RAWOUTPUT) &&
 795                                     !(IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))));
 796                         }
 797                 }
 798
 799                 /*
 800                  * If this is the case, we probably don't want to allocate
 801                  * a protocol-cloned route since we didn't get one from the
 802                  * ULP.  This lets TCP do its thing, while not burdening
 803                  * forwarding or ICMP with the overhead of cloning a route.
 804                  * Of course, we still want to do any cloning requested by
 805                  * the link layer, as this is probably required in all cases
 806                  * for correct operation (as it is for ARP).
 807                  */
 808                 if (ro->ro_rt == NULL) {
 809                         unsigned long ign = RTF_PRCLONING;
 810                         /*
 811                          * We make an exception here: if the destination
 812                          * address is INADDR_BROADCAST, allocate a protocol-
 813                          * cloned host route so that we end up with a route
 814                          * marked with the RTF_BROADCAST flag.  Otherwise,
 815                          * we would end up referring to the default route,
 816                          * instead of creating a cloned host route entry.
 817                          * That would introduce inconsistencies between ULPs
 818                          * that allocate a route and those that don't.  The
 819                          * RTF_BROADCAST route is important since we'd want
 820                          * to send out undirected IP broadcast packets using
 821                          * link-level broadcast address. Another exception
 822                          * is for ULP-created routes that got blown away by
 823                          * source interface selection (see above).
 824                          *
 825                          * These exceptions will no longer be necessary when
 826                          * the RTF_PRCLONING scheme is no longer present.
 827                          */
 828                         if (cloneok || dst->sin_addr.s_addr == INADDR_BROADCAST) {
 829                                 ign &= ~RTF_PRCLONING;
 830                         }
 831
 832                         /*
 833                          * Loosen the route lookup criteria if the ifscope
 834                          * corresponds to the loopback interface; this is
 835                          * needed to support Application Layer Gateways
 836                          * listening on loopback, in conjunction with packet
 837                          * filter redirection rules.  The final source IP
 838                          * address will be rewritten by the packet filter
 839                          * prior to the RFC1122 loopback check below.
 840                          */
 841                         if (ifscope == lo_ifp->if_index) {
 842                                 rtalloc_ign(ro, ign);
 843                         } else {
 844                                 rtalloc_scoped_ign(ro, ign, ifscope);
 845                         }
 846
 847                         /*
 848                          * If the route points to a cellular/expensive interface
 849                          * and the caller forbids our using interfaces of such type,
 850                          * pretend that there is no route.
 851                          */
 852                         if (ro->ro_rt != NULL) {
 853                                 RT_LOCK_SPIN(ro->ro_rt);
 854                                 if (IP_CHECK_RESTRICTIONS(ro->ro_rt->rt_ifp,
 855                                     ipobf)) {
 856                                         RT_UNLOCK(ro->ro_rt);
 857                                         ROUTE_RELEASE(ro);
 858                                         if (flags & IP_OUTARGS) {
 859                                                 ipoa->ipoa_retflags |=
 860                                                     IPOARF_IFDENIED;
 861                                         }
 862                                 } else {
 863                                         RT_UNLOCK(ro->ro_rt);
 864                                 }
 865                         }
 866                 }
 867
 868                 if (ro->ro_rt == NULL) {
 869                         OSAddAtomic(1, &ipstat.ips_noroute);
 870                         error = EHOSTUNREACH;
 871                         if (ia0 != NULL) {
 872                                 IFA_REMREF(ia0);
 873                                 ia0 = NULL;
 874                         }
 875                         goto bad;
 876                 }
 877
 878                 if (ia != NULL) {
 879                         IFA_REMREF(&ia->ia_ifa);
 880                 }
 881                 RT_LOCK_SPIN(ro->ro_rt);
 882                 ia = ifatoia(ro->ro_rt->rt_ifa);
 883                 if (ia != NULL) {
 884                         /* Become a regular mutex */
 885                         RT_CONVERT_LOCK(ro->ro_rt);
 886                         IFA_ADDREF(&ia->ia_ifa);
 887                 }
 888                 /*
 889                  * Note: ia_ifp may not be the same as rt_ifp; the latter
 890                  * is what we use for determining outbound i/f, mtu, etc.
 891                  */
 892                 ifp = ro->ro_rt->rt_ifp;
 893                 ro->ro_rt->rt_use++;
 894                 if (ro->ro_rt->rt_flags & RTF_GATEWAY) {
 895                         dst = SIN(ro->ro_rt->rt_gateway);
 896                 }
 897                 if (ro->ro_rt->rt_flags & RTF_HOST) {
 898                         /* double negation needed for bool bit field */
 899                         ipobf.isbroadcast =
 900                             !!(ro->ro_rt->rt_flags & RTF_BROADCAST);
 901                 } else {
 902                         /* Become a regular mutex */
 903                         RT_CONVERT_LOCK(ro->ro_rt);
 904                         ipobf.isbroadcast = in_broadcast(dst->sin_addr, ifp);
 905                 }
 906                 /*
 907                  * For consistency with IPv6, as well as to ensure that
 908                  * IP_RECVIF is set correctly for packets that are sent
 909                  * to one of the local addresses.  ia (rt_ifa) would have
 910                  * been fixed up by rt_setif for local routes.  This
 911                  * would make it appear as if the packet arrives on the
 912                  * interface which owns the local address.  Loopback
 913                  * multicast case is handled separately by ip_mloopback().
 914                  */
 915                 if (ia != NULL && (ifp->if_flags & IFF_LOOPBACK) &&
 916                     !IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
 917                         uint32_t srcidx;
 918
 919                         m->m_pkthdr.rcvif = ia->ia_ifa.ifa_ifp;
 920
 921                         if (ia0 != NULL) {
 922                                 srcidx = ia0->ifa_ifp->if_index;
 923                         } else if ((ro->ro_flags & ROF_SRCIF_SELECTED) &&
 924                             ro->ro_srcia != NULL) {
 925                                 srcidx = ro->ro_srcia->ifa_ifp->if_index;
 926                         } else {
 927                                 srcidx = 0;
 928                         }
 929
 930                         ip_setsrcifaddr_info(m, srcidx, NULL);
 931                         ip_setdstifaddr_info(m, 0, ia);
 932                 }
 933                 RT_UNLOCK(ro->ro_rt);
 934                 if (ia0 != NULL) {
 935                         IFA_REMREF(ia0);
 936                         ia0 = NULL;
 937                 }
 938         }
 939
 940         if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
 941                 struct ifnet *srcifp = NULL;
 942                 struct in_multi *inm;
 943                 u_int32_t vif = 0;
 944                 u_int8_t ttl = IP_DEFAULT_MULTICAST_TTL;
 945                 u_int8_t loop = IP_DEFAULT_MULTICAST_LOOP;
 946
 947                 m->m_flags |= M_MCAST;
 948                 /*
 949                  * IP destination address is multicast.  Make sure "dst"
 950                  * still points to the address in "ro".  (It may have been
 951                  * changed to point to a gateway address, above.)
 952                  */
 953                 dst = SIN(&ro->ro_dst);
 954                 /*
 955                  * See if the caller provided any multicast options
 956                  */
 957                 if (imo != NULL) {
 958                         IMO_LOCK(imo);
 959                         vif = imo->imo_multicast_vif;
 960                         ttl = imo->imo_multicast_ttl;
 961                         loop = imo->imo_multicast_loop;
 962                         if (!(flags & IP_RAWOUTPUT)) {
 963                                 ip->ip_ttl = ttl;
 964                         }
 965                         if (imo->imo_multicast_ifp != NULL) {
 966                                 ifp = imo->imo_multicast_ifp;
 967                         }
 968                         IMO_UNLOCK(imo);
 969                 } else if (!(flags & IP_RAWOUTPUT)) {
 970                         vif = -1;
 971                         ip->ip_ttl = ttl;
 972                 }
 973                 /*
 974                  * Confirm that the outgoing interface supports multicast.
 975                  */
 976                 if (imo == NULL || vif == -1) {
 977                         if (!(ifp->if_flags & IFF_MULTICAST)) {
 978                                 OSAddAtomic(1, &ipstat.ips_noroute);
 979                                 error = ENETUNREACH;
 980                                 goto bad;
 981                         }
 982                 }
 983                 /*
 984                  * If source address not specified yet, use address
 985                  * of outgoing interface.
 986                  */
 987                 if (ip->ip_src.s_addr == INADDR_ANY) {
 988                         struct in_ifaddr *ia1;
 989                         lck_rw_lock_shared(in_ifaddr_rwlock);
 990                         TAILQ_FOREACH(ia1, &in_ifaddrhead, ia_link) {
 991                                 IFA_LOCK_SPIN(&ia1->ia_ifa);
 992                                 if (ia1->ia_ifp == ifp) {
 993                                         ip->ip_src = IA_SIN(ia1)->sin_addr;
 994                                         srcifp = ifp;
 995                                         IFA_UNLOCK(&ia1->ia_ifa);
 996                                         break;
 997                                 }
 998                                 IFA_UNLOCK(&ia1->ia_ifa);
 999                         }
1000                         lck_rw_done(in_ifaddr_rwlock);
1001                         if (ip->ip_src.s_addr == INADDR_ANY) {
1002                                 error = ENETUNREACH;
1003                                 goto bad;
1004                         }
1005                 }
1006
1007                 in_multihead_lock_shared();
1008                 IN_LOOKUP_MULTI(&pkt_dst, ifp, inm);
1009                 in_multihead_lock_done();
1010                 if (inm != NULL && (imo == NULL || loop)) {
1011                         /*
1012                          * If we belong to the destination multicast group
1013                          * on the outgoing interface, and the caller did not
1014                          * forbid loopback, loop back a copy.
1015                          */
1016                         if (!TAILQ_EMPTY(&ipv4_filters)
1017 #if NECP
1018                             && !necp_packet_should_skip_filters(m)
1019 #endif // NECP
1020                             ) {
1021                                 struct ipfilter *filter;
1022                                 int seen = (inject_filter_ref == NULL);
1023
1024                                 if (imo != NULL) {
1025                                         ipf_pktopts.ippo_flags |=
1026                                             IPPOF_MCAST_OPTS;
1027                                         ipf_pktopts.ippo_mcast_ifnet = ifp;
1028                                         ipf_pktopts.ippo_mcast_ttl = ttl;
1029                                         ipf_pktopts.ippo_mcast_loop = loop;
1030                                 }
1031
1032                                 ipf_ref();
1033
1034                                 /*
1035                                  * 4135317 - always pass network byte
1036                                  * order to filter
1037                                  */
1038 #if BYTE_ORDER != BIG_ENDIAN
1039                                 HTONS(ip->ip_len);
1040                                 HTONS(ip->ip_off);
1041 #endif
1042                                 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
1043                                         if (seen == 0) {
1044                                                 if ((struct ipfilter *)
1045                                                     inject_filter_ref == filter) {
1046                                                         seen = 1;
1047                                                 }
1048                                         } else if (filter->ipf_filter.
1049                                             ipf_output != NULL) {
1050                                                 errno_t result;
1051                                                 result = filter->ipf_filter.
1052                                                     ipf_output(filter->
1053                                                     ipf_filter.cookie,
1054                                                     (mbuf_t *)&m, ippo);
1055                                                 if (result == EJUSTRETURN) {
1056                                                         ipf_unref();
1057                                                         INM_REMREF(inm);
1058                                                         goto done;
1059                                                 }
1060                                                 if (result != 0) {
1061                                                         ipf_unref();
1062                                                         INM_REMREF(inm);
1063                                                         goto bad;
1064                                                 }
1065                                         }
1066                                 }
1067
1068                                 /* set back to host byte order */
1069                                 ip = mtod(m, struct ip *);
1070 #if BYTE_ORDER != BIG_ENDIAN
1071                                 NTOHS(ip->ip_len);
1072                                 NTOHS(ip->ip_off);
1073 #endif
1074                                 ipf_unref();
1075                                 ipobf.didfilter = TRUE;
1076                         }
1077                         ip_mloopback(srcifp, ifp, m, dst, hlen);
1078                 }
1079                 if (inm != NULL) {
1080                         INM_REMREF(inm);
1081                 }
1082                 /*
1083                  * Multicasts with a time-to-live of zero may be looped-
1084                  * back, above, but must not be transmitted on a network.
1085                  * Also, multicasts addressed to the loopback interface
1086                  * are not sent -- the above call to ip_mloopback() will
1087                  * loop back a copy if this host actually belongs to the
1088                  * destination group on the loopback interface.
1089                  */
1090                 if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
1091                         m_freem(m);
1092                         goto done;
1093                 }
1094
1095                 goto sendit;
1096         }
1097         /*
1098          * If source address not specified yet, use address
1099          * of outgoing interface.
1100          */
1101         if (ip->ip_src.s_addr == INADDR_ANY) {
1102                 IFA_LOCK_SPIN(&ia->ia_ifa);
1103                 ip->ip_src = IA_SIN(ia)->sin_addr;
1104                 IFA_UNLOCK(&ia->ia_ifa);
1105 #if IPFIREWALL_FORWARD
1106                 /*
1107                  * Keep note that we did this - if the firewall changes
1108                  * the next-hop, our interface may change, changing the
1109                  * default source IP. It's a shame so much effort happens
1110                  * twice. Oh well.
1111                  */
1112                 ipobf.fwd_rewrite_src = TRUE;
1113 #endif /* IPFIREWALL_FORWARD */
1114         }
1115
1116         /*
1117          * Look for broadcast address
1118          * and verify user is allowed to send
1119          * such a packet.
1120          */
1121         if (ipobf.isbroadcast) {
1122                 if (!(ifp->if_flags & IFF_BROADCAST)) {
1123                         error = EADDRNOTAVAIL;
1124                         goto bad;
1125                 }
1126                 if (!(flags & IP_ALLOWBROADCAST)) {
1127                         error = EACCES;
1128                         goto bad;
1129                 }
1130                 /* don't allow broadcast messages to be fragmented */
1131                 if ((u_short)ip->ip_len > ifp->if_mtu) {
1132                         error = EMSGSIZE;
1133                         goto bad;
1134                 }
1135                 m->m_flags |= M_BCAST;
1136         } else {
1137                 m->m_flags &= ~M_BCAST;
1138         }
1139
1140 sendit:
1141 #if PF
1142         /* Invoke outbound packet filter */
1143         if (PF_IS_ENABLED) {
1144                 int rc;
1145
1146                 m0 = m; /* Save for later */
1147 #if DUMMYNET
1148                 args.fwa_m = m;
1149                 args.fwa_next_hop = dst;
1150                 args.fwa_oif = ifp;
1151                 args.fwa_ro = ro;
1152                 args.fwa_dst = dst;
1153                 args.fwa_oflags = flags;
1154                 if (flags & IP_OUTARGS) {
1155                         args.fwa_ipoa = ipoa;
1156                 }
1157                 rc = pf_af_hook(ifp, mppn, &m, AF_INET, FALSE, &args);
1158 #else /* DUMMYNET */
1159                 rc = pf_af_hook(ifp, mppn, &m, AF_INET, FALSE, NULL);
1160 #endif /* DUMMYNET */
1161                 if (rc != 0 || m == NULL) {
1162                         /* Move to the next packet */
1163                         m = *mppn;
1164
1165                         /* Skip ahead if first packet in list got dropped */
1166                         if (packetlist == m0) {
1167                                 packetlist = m;
1168                         }
1169
1170                         if (m != NULL) {
1171                                 m0 = m;
1172                                 /* Next packet in the chain */
1173                                 goto loopit;
1174                         } else if (packetlist != NULL) {
1175                                 /* No more packets; send down the chain */
1176                                 goto sendchain;
1177                         }
1178                         /* Nothing left; we're done */
1179                         goto done;
1180                 }
1181                 m0 = m;
1182                 ip = mtod(m, struct ip *);
1183                 pkt_dst = ip->ip_dst;
1184                 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1185         }
1186 #endif /* PF */
1187         /*
1188          * Force IP TTL to 255 following draft-ietf-zeroconf-ipv4-linklocal.txt
1189          */
1190         if (IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)) ||
1191             IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) {
1192                 ip_linklocal_stat.iplls_out_total++;
1193                 if (ip->ip_ttl != MAXTTL) {
1194                         ip_linklocal_stat.iplls_out_badttl++;
1195                         ip->ip_ttl = MAXTTL;
1196                 }
1197         }
1198
1199         if (!ipobf.didfilter &&
1200             !TAILQ_EMPTY(&ipv4_filters)
1201 #if NECP
1202             && !necp_packet_should_skip_filters(m)
1203 #endif // NECP
1204             ) {
1205                 struct ipfilter *filter;
1206                 int seen = (inject_filter_ref == NULL);
1207                 ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS;
1208
1209                 /*
1210                  * Check that a TSO frame isn't passed to a filter.
1211                  * This could happen if a filter is inserted while
1212                  * TCP is sending the TSO packet.
1213                  */
1214                 if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) {
1215                         error = EMSGSIZE;
1216                         goto bad;
1217                 }
1218
1219                 ipf_ref();
1220
1221                 /* 4135317 - always pass network byte order to filter */
1222 #if BYTE_ORDER != BIG_ENDIAN
1223                 HTONS(ip->ip_len);
1224                 HTONS(ip->ip_off);
1225 #endif
1226                 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
1227                         if (seen == 0) {
1228                                 if ((struct ipfilter *)inject_filter_ref ==
1229                                     filter) {
1230                                         seen = 1;
1231                                 }
1232                         } else if (filter->ipf_filter.ipf_output) {
1233                                 errno_t result;
1234                                 result = filter->ipf_filter.
1235                                     ipf_output(filter->ipf_filter.cookie,
1236                                     (mbuf_t *)&m, ippo);
1237                                 if (result == EJUSTRETURN) {
1238                                         ipf_unref();
1239                                         goto done;
1240                                 }
1241                                 if (result != 0) {
1242                                         ipf_unref();
1243                                         goto bad;
1244                                 }
1245                         }
1246                 }
1247                 /* set back to host byte order */
1248                 ip = mtod(m, struct ip *);
1249 #if BYTE_ORDER != BIG_ENDIAN
1250                 NTOHS(ip->ip_len);
1251                 NTOHS(ip->ip_off);
1252 #endif
1253                 ipf_unref();
1254         }
1255
1256 #if NECP
1257         /* Process Network Extension Policy. Will Pass, Drop, or Rebind packet. */
1258         necp_matched_policy_id = necp_ip_output_find_policy_match(m,
1259             flags, (flags & IP_OUTARGS) ? ipoa : NULL, ro ? ro->ro_rt : NULL, &necp_result, &necp_result_parameter);
1260         if (necp_matched_policy_id) {
1261                 necp_mark_packet_from_ip(m, necp_matched_policy_id);
1262                 switch (necp_result) {
1263                 case NECP_KERNEL_POLICY_RESULT_PASS:
1264                         /* Check if the interface is allowed */
1265                         if (!necp_packet_is_allowed_over_interface(m, ifp)) {
1266                                 error = EHOSTUNREACH;
1267                                 OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1268                                 goto bad;
1269                         }
1270                         goto skip_ipsec;
1271                 case NECP_KERNEL_POLICY_RESULT_DROP:
1272                 case NECP_KERNEL_POLICY_RESULT_SOCKET_DIVERT:
1273                         /* Flow divert packets should be blocked at the IP layer */
1274                         error = EHOSTUNREACH;
1275                         OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1276                         goto bad;
1277                 case NECP_KERNEL_POLICY_RESULT_IP_TUNNEL: {
1278                         /* Verify that the packet is being routed to the tunnel */
1279                         struct ifnet *policy_ifp = necp_get_ifnet_from_result_parameter(&necp_result_parameter);
1280                         if (policy_ifp == ifp) {
1281                                 /* Check if the interface is allowed */
1282                                 if (!necp_packet_is_allowed_over_interface(m, ifp)) {
1283                                         error = EHOSTUNREACH;
1284                                         OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1285                                         goto bad;
1286                                 }
1287                                 goto skip_ipsec;
1288                         } else {
1289                                 if (necp_packet_can_rebind_to_ifnet(m, policy_ifp, &necp_route, AF_INET)) {
1290                                         /* Check if the interface is allowed */
1291                                         if (!necp_packet_is_allowed_over_interface(m, policy_ifp)) {
1292                                                 error = EHOSTUNREACH;
1293                                                 OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1294                                                 goto bad;
1295                                         }
1296
1297                                         /* Set ifp to the tunnel interface, since it is compatible with the packet */
1298                                         ifp = policy_ifp;
1299                                         ro = &necp_route;
1300                                         goto skip_ipsec;
1301                                 } else {
1302                                         error = ENETUNREACH;
1303                                         OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1304                                         goto bad;
1305                                 }
1306                         }
1307                 }
1308                 default:
1309                         break;
1310                 }
1311         }
1312         /* Catch-all to check if the interface is allowed */
1313         if (!necp_packet_is_allowed_over_interface(m, ifp)) {
1314                 error = EHOSTUNREACH;
1315                 OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1316                 goto bad;
1317         }
1318 #endif /* NECP */
1319
1320 #if IPSEC
1321         if (ipsec_bypass != 0 || (flags & IP_NOIPSEC)) {
1322                 goto skip_ipsec;
1323         }
1324
1325         KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
1326
1327         if (sp == NULL) {
1328                 /* get SP for this packet */
1329                 if (so != NULL) {
1330                         sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND,
1331                             so, &error);
1332                 } else {
1333                         sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND,
1334                             flags, &error);
1335                 }
1336                 if (sp == NULL) {
1337                         IPSEC_STAT_INCREMENT(ipsecstat.out_inval);
1338                         KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1339                             0, 0, 0, 0, 0);
1340                         goto bad;
1341                 }
1342         }
1343
1344         error = 0;
1345
1346         /* check policy */
1347         switch (sp->policy) {
1348         case IPSEC_POLICY_DISCARD:
1349         case IPSEC_POLICY_GENERATE:
1350                 /*
1351                  * This packet is just discarded.
1352                  */
1353                 IPSEC_STAT_INCREMENT(ipsecstat.out_polvio);
1354                 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1355                     1, 0, 0, 0, 0);
1356                 goto bad;
1357
1358         case IPSEC_POLICY_BYPASS:
1359         case IPSEC_POLICY_NONE:
1360                 /* no need to do IPsec. */
1361                 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1362                     2, 0, 0, 0, 0);
1363                 goto skip_ipsec;
1364
1365         case IPSEC_POLICY_IPSEC:
1366                 if (sp->req == NULL) {
1367                         /* acquire a policy */
1368                         error = key_spdacquire(sp);
1369                         KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1370                             3, 0, 0, 0, 0);
1371                         goto bad;
1372                 }
1373                 if (sp->ipsec_if) {
1374                         /* Verify the redirect to ipsec interface */
1375                         if (sp->ipsec_if == ifp) {
1376                                 goto skip_ipsec;
1377                         }
1378                         goto bad;
1379                 }
1380                 break;
1381
1382         case IPSEC_POLICY_ENTRUST:
1383         default:
1384                 printf("ip_output: Invalid policy found. %d\n", sp->policy);
1385         }
1386         {
1387                 ipsec_state.m = m;
1388                 if (flags & IP_ROUTETOIF) {
1389                         bzero(&ipsec_state.ro, sizeof(ipsec_state.ro));
1390                 } else {
1391                         route_copyout((struct route *)&ipsec_state.ro, ro, sizeof(struct route));
1392                 }
1393                 ipsec_state.dst = SA(dst);
1394
1395                 ip->ip_sum = 0;
1396
1397                 /*
1398                  * XXX
1399                  * delayed checksums are not currently compatible with IPsec
1400                  */
1401                 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1402                         in_delayed_cksum(m);
1403                 }
1404
1405 #if BYTE_ORDER != BIG_ENDIAN
1406                 HTONS(ip->ip_len);
1407                 HTONS(ip->ip_off);
1408 #endif
1409
1410                 DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
1411                     struct ip *, ip, struct ifnet *, ifp,
1412                     struct ip *, ip, struct ip6_hdr *, NULL);
1413
1414                 error = ipsec4_output(&ipsec_state, sp, flags);
1415                 if (ipsec_state.tunneled == 6) {
1416                         m0 = m = NULL;
1417                         error = 0;
1418                         goto bad;
1419                 }
1420
1421                 m0 = m = ipsec_state.m;
1422
1423 #if DUMMYNET
1424                 /*
1425                  * If we're about to use the route in ipsec_state
1426                  * and this came from dummynet, cleanup now.
1427                  */
1428                 if (ro == &saved_route &&
1429                     (!(flags & IP_ROUTETOIF) || ipsec_state.tunneled)) {
1430                         ROUTE_RELEASE(ro);
1431                 }
1432 #endif /* DUMMYNET */
1433
1434                 if (flags & IP_ROUTETOIF) {
1435                         /*
1436                          * if we have tunnel mode SA, we may need to ignore
1437                          * IP_ROUTETOIF.
1438                          */
1439                         if (ipsec_state.tunneled) {
1440                                 flags &= ~IP_ROUTETOIF;
1441                                 ro = (struct route *)&ipsec_state.ro;
1442                         }
1443                 } else {
1444                         ro = (struct route *)&ipsec_state.ro;
1445                 }
1446                 dst = SIN(ipsec_state.dst);
1447                 if (error) {
1448                         /* mbuf is already reclaimed in ipsec4_output. */
1449                         m0 = NULL;
1450                         switch (error) {
1451                         case EHOSTUNREACH:
1452                         case ENETUNREACH:
1453                         case EMSGSIZE:
1454                         case ENOBUFS:
1455                         case ENOMEM:
1456                                 break;
1457                         default:
1458                                 printf("ip4_output (ipsec): error code %d\n", error);
1459                         /* FALLTHRU */
1460                         case ENOENT:
1461                                 /* don't show these error codes to the user */
1462                                 error = 0;
1463                                 break;
1464                         }
1465                         KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1466                             4, 0, 0, 0, 0);
1467                         goto bad;
1468                 }
1469         }
1470
1471         /* be sure to update variables that are affected by ipsec4_output() */
1472         ip = mtod(m, struct ip *);
1473
1474 #ifdef _IP_VHL
1475         hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1476 #else /* !_IP_VHL */
1477         hlen = ip->ip_hl << 2;
1478 #endif /* !_IP_VHL */
1479         /* Check that there wasn't a route change and src is still valid */
1480         if (ROUTE_UNUSABLE(ro)) {
1481                 ROUTE_RELEASE(ro);
1482                 VERIFY(src_ia == NULL);
1483                 if (ip->ip_src.s_addr != INADDR_ANY &&
1484                     !(flags & (IP_ROUTETOIF | IP_FORWARDING)) &&
1485                     (src_ia = ifa_foraddr(ip->ip_src.s_addr)) == NULL) {
1486                         error = EADDRNOTAVAIL;
1487                         KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1488                             5, 0, 0, 0, 0);
1489                         goto bad;
1490                 }
1491                 if (src_ia != NULL) {
1492                         IFA_REMREF(&src_ia->ia_ifa);
1493                         src_ia = NULL;
1494                 }
1495         }
1496
1497         if (ro->ro_rt == NULL) {
1498                 if (!(flags & IP_ROUTETOIF)) {
1499                         printf("%s: can't update route after "
1500                             "IPsec processing\n", __func__);
1501                         error = EHOSTUNREACH;   /* XXX */
1502                         KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1503                             6, 0, 0, 0, 0);
1504                         goto bad;
1505                 }
1506         } else {
1507                 if (ia != NULL) {
1508                         IFA_REMREF(&ia->ia_ifa);
1509                 }
1510                 RT_LOCK_SPIN(ro->ro_rt);
1511                 ia = ifatoia(ro->ro_rt->rt_ifa);
1512                 if (ia != NULL) {
1513                         /* Become a regular mutex */
1514                         RT_CONVERT_LOCK(ro->ro_rt);
1515                         IFA_ADDREF(&ia->ia_ifa);
1516                 }
1517                 ifp = ro->ro_rt->rt_ifp;
1518                 RT_UNLOCK(ro->ro_rt);
1519         }
1520
1521         /* make it flipped, again. */
1522 #if BYTE_ORDER != BIG_ENDIAN
1523         NTOHS(ip->ip_len);
1524         NTOHS(ip->ip_off);
1525 #endif
1526         KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1527             7, 0xff, 0xff, 0xff, 0xff);
1528
1529         /* Pass to filters again */
1530         if (!TAILQ_EMPTY(&ipv4_filters)
1531 #if NECP
1532             && !necp_packet_should_skip_filters(m)
1533 #endif // NECP
1534             ) {
1535                 struct ipfilter *filter;
1536
1537                 ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS;
1538
1539                 /*
1540                  * Check that a TSO frame isn't passed to a filter.
1541                  * This could happen if a filter is inserted while
1542                  * TCP is sending the TSO packet.
1543                  */
1544                 if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) {
1545                         error = EMSGSIZE;
1546                         goto bad;
1547                 }
1548
1549                 ipf_ref();
1550
1551                 /* 4135317 - always pass network byte order to filter */
1552 #if BYTE_ORDER != BIG_ENDIAN
1553                 HTONS(ip->ip_len);
1554                 HTONS(ip->ip_off);
1555 #endif
1556                 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
1557                         if (filter->ipf_filter.ipf_output) {
1558                                 errno_t result;
1559                                 result = filter->ipf_filter.
1560                                     ipf_output(filter->ipf_filter.cookie,
1561                                     (mbuf_t *)&m, ippo);
1562                                 if (result == EJUSTRETURN) {
1563                                         ipf_unref();
1564                                         goto done;
1565                                 }
1566                                 if (result != 0) {
1567                                         ipf_unref();
1568                                         goto bad;
1569                                 }
1570                         }
1571                 }
1572                 /* set back to host byte order */
1573                 ip = mtod(m, struct ip *);
1574 #if BYTE_ORDER != BIG_ENDIAN
1575                 NTOHS(ip->ip_len);
1576                 NTOHS(ip->ip_off);
1577 #endif
1578                 ipf_unref();
1579         }
1580 skip_ipsec:
1581 #endif /* IPSEC */
1582
1583 #if IPFIREWALL
1584         /*
1585          * Check with the firewall...
1586          * but not if we are already being fwd'd from a firewall.
1587          */
1588         if (fw_enable && IPFW_LOADED && !args.fwa_next_hop) {
1589                 struct sockaddr_in *old = dst;
1590
1591                 args.fwa_m = m;
1592                 args.fwa_next_hop = dst;
1593                 args.fwa_oif = ifp;
1594                 ipfwoff = ip_fw_chk_ptr(&args);
1595                 m = args.fwa_m;
1596                 dst = args.fwa_next_hop;
1597
1598                 /*
1599                  * On return we must do the following:
1600                  *   IP_FW_PORT_DENY_FLAG         -> drop the pkt (XXX new)
1601                  *   1<=off<= 0xffff              -> DIVERT
1602                  *   (off & IP_FW_PORT_DYNT_FLAG) -> send to a DUMMYNET pipe
1603                  *   (off & IP_FW_PORT_TEE_FLAG)  -> TEE the packet
1604                  *   dst != old                   -> IPFIREWALL_FORWARD
1605                  *   off==0, dst==old             -> accept
1606                  * If some of the above modules are not compiled in, then
1607                  * we shouldn't have to check the corresponding condition
1608                  * (because the ipfw control socket should not accept
1609                  * unsupported rules), but better play safe and drop
1610                  * packets in case of doubt.
1611                  */
1612                 m0 = m;
1613                 if ((ipfwoff & IP_FW_PORT_DENY_FLAG) || m == NULL) {
1614                         if (m) {
1615                                 m_freem(m);
1616                         }
1617                         error = EACCES;
1618                         goto done;
1619                 }
1620                 ip = mtod(m, struct ip *);
1621
1622                 if (ipfwoff == 0 && dst == old) {       /* common case */
1623                         goto pass;
1624                 }
1625 #if DUMMYNET
1626                 if (DUMMYNET_LOADED && (ipfwoff & IP_FW_PORT_DYNT_FLAG) != 0) {
1627                         /*
1628                          * pass the pkt to dummynet. Need to include
1629                          * pipe number, m, ifp, ro, dst because these are
1630                          * not recomputed in the next pass.
1631                          * All other parameters have been already used and
1632                          * so they are not needed anymore.
1633                          * XXX note: if the ifp or ro entry are deleted
1634                          * while a pkt is in dummynet, we are in trouble!
1635                          */
1636                         args.fwa_ro = ro;
1637                         args.fwa_dst = dst;
1638                         args.fwa_oflags = flags;
1639                         if (flags & IP_OUTARGS) {
1640                                 args.fwa_ipoa = ipoa;
1641                         }
1642
1643                         error = ip_dn_io_ptr(m, ipfwoff & 0xffff, DN_TO_IP_OUT,
1644                             &args, DN_CLIENT_IPFW);
1645                         goto done;
1646                 }
1647 #endif /* DUMMYNET */
1648 #if IPDIVERT
1649                 if (ipfwoff != 0 && (ipfwoff & IP_FW_PORT_DYNT_FLAG) == 0) {
1650                         struct mbuf *clone = NULL;
1651
1652                         /* Clone packet if we're doing a 'tee' */
1653                         if ((ipfwoff & IP_FW_PORT_TEE_FLAG) != 0) {
1654                                 clone = m_dup(m, M_DONTWAIT);
1655                         }
1656                         /*
1657                          * XXX
1658                          * delayed checksums are not currently compatible
1659                          * with divert sockets.
1660                          */
1661                         if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1662                                 in_delayed_cksum(m);
1663                         }
1664
1665                         /* Restore packet header fields to original values */
1666
1667 #if BYTE_ORDER != BIG_ENDIAN
1668                         HTONS(ip->ip_len);
1669                         HTONS(ip->ip_off);
1670 #endif
1671
1672                         /* Deliver packet to divert input routine */
1673                         divert_packet(m, 0, ipfwoff & 0xffff,
1674                             args.fwa_divert_rule);
1675
1676                         /* If 'tee', continue with original packet */
1677                         if (clone != NULL) {
1678                                 m0 = m = clone;
1679                                 ip = mtod(m, struct ip *);
1680                                 goto pass;
1681                         }
1682                         goto done;
1683                 }
1684 #endif /* IPDIVERT */
1685 #if IPFIREWALL_FORWARD
1686                 /*
1687                  * Here we check dst to make sure it's directly reachable on
1688                  * the interface we previously thought it was.
1689                  * If it isn't (which may be likely in some situations) we have
1690                  * to re-route it (ie, find a route for the next-hop and the
1691                  * associated interface) and set them here. This is nested
1692                  * forwarding which in most cases is undesirable, except where
1693                  * such control is nigh impossible. So we do it here.
1694                  * And I'm babbling.
1695                  */
1696                 if (ipfwoff == 0 && old != dst) {
1697                         struct in_ifaddr *ia_fw;
1698                         struct route *ro_fwd = &sro_fwd;
1699
1700 #if IPFIREWALL_FORWARD_DEBUG
1701                         printf("IPFIREWALL_FORWARD: New dst ip: ");
1702                         print_ip(dst->sin_addr);
1703                         printf("\n");
1704 #endif /* IPFIREWALL_FORWARD_DEBUG */
1705                         /*
1706                          * We need to figure out if we have been forwarded
1707                          * to a local socket. If so then we should somehow
1708                          * "loop back" to ip_input, and get directed to the
1709                          * PCB as if we had received this packet. This is
1710                          * because it may be difficult to identify the packets
1711                          * you want to forward until they are being output
1712                          * and have selected an interface. (e.g. locally
1713                          * initiated packets) If we used the loopback interface,
1714                          * we would not be able to control what happens
1715                          * as the packet runs through ip_input() as
1716                          * it is done through a ISR.
1717                          */
1718                         lck_rw_lock_shared(in_ifaddr_rwlock);
1719                         TAILQ_FOREACH(ia_fw, &in_ifaddrhead, ia_link) {
1720                                 /*
1721                                  * If the addr to forward to is one
1722                                  * of ours, we pretend to
1723                                  * be the destination for this packet.
1724                                  */
1725                                 IFA_LOCK_SPIN(&ia_fw->ia_ifa);
1726                                 if (IA_SIN(ia_fw)->sin_addr.s_addr ==
1727                                     dst->sin_addr.s_addr) {
1728                                         IFA_UNLOCK(&ia_fw->ia_ifa);
1729                                         break;
1730                                 }
1731                                 IFA_UNLOCK(&ia_fw->ia_ifa);
1732                         }
1733                         lck_rw_done(in_ifaddr_rwlock);
1734                         if (ia_fw) {
1735                                 /* tell ip_input "dont filter" */
1736                                 struct m_tag            *fwd_tag;
1737                                 struct ip_fwd_tag       *ipfwd_tag;
1738
1739                                 fwd_tag = m_tag_create(KERNEL_MODULE_TAG_ID,
1740                                     KERNEL_TAG_TYPE_IPFORWARD,
1741                                     sizeof(*ipfwd_tag), M_NOWAIT, m);
1742                                 if (fwd_tag == NULL) {
1743                                         error = ENOBUFS;
1744                                         goto bad;
1745                                 }
1746
1747                                 ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag + 1);
1748                                 ipfwd_tag->next_hop = args.fwa_next_hop;
1749
1750                                 m_tag_prepend(m, fwd_tag);
1751
1752                                 if (m->m_pkthdr.rcvif == NULL) {
1753                                         m->m_pkthdr.rcvif = lo_ifp;
1754                                 }
1755
1756 #if BYTE_ORDER != BIG_ENDIAN
1757                                 HTONS(ip->ip_len);
1758                                 HTONS(ip->ip_off);
1759 #endif
1760                                 mbuf_outbound_finalize(m, PF_INET, 0);
1761
1762                                 /*
1763                                  * we need to call dlil_output to run filters
1764                                  * and resync to avoid recursion loops.
1765                                  */
1766                                 if (lo_ifp) {
1767                                         dlil_output(lo_ifp, PF_INET, m, NULL,
1768                                             SA(dst), 0, adv);
1769                                 } else {
1770                                         printf("%s: no loopback ifp for "
1771                                             "forwarding!!!\n", __func__);
1772                                 }
1773                                 goto done;
1774                         }
1775                         /*
1776                          * Some of the logic for this was nicked from above.
1777                          *
1778                          * This rewrites the cached route in a local PCB.
1779                          * Is this what we want to do?
1780                          */
1781                         ROUTE_RELEASE(ro_fwd);
1782                         bcopy(dst, &ro_fwd->ro_dst, sizeof(*dst));
1783
1784                         rtalloc_ign(ro_fwd, RTF_PRCLONING, false);
1785
1786                         if (ro_fwd->ro_rt == NULL) {
1787                                 OSAddAtomic(1, &ipstat.ips_noroute);
1788                                 error = EHOSTUNREACH;
1789                                 goto bad;
1790                         }
1791
1792                         RT_LOCK_SPIN(ro_fwd->ro_rt);
1793                         ia_fw = ifatoia(ro_fwd->ro_rt->rt_ifa);
1794                         if (ia_fw != NULL) {
1795                                 /* Become a regular mutex */
1796                                 RT_CONVERT_LOCK(ro_fwd->ro_rt);
1797                                 IFA_ADDREF(&ia_fw->ia_ifa);
1798                         }
1799                         ifp = ro_fwd->ro_rt->rt_ifp;
1800                         ro_fwd->ro_rt->rt_use++;
1801                         if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY) {
1802                                 dst = SIN(ro_fwd->ro_rt->rt_gateway);
1803                         }
1804                         if (ro_fwd->ro_rt->rt_flags & RTF_HOST) {
1805                                 /* double negation needed for bool bit field */
1806                                 ipobf.isbroadcast =
1807                                     !!(ro_fwd->ro_rt->rt_flags & RTF_BROADCAST);
1808                         } else {
1809                                 /* Become a regular mutex */
1810                                 RT_CONVERT_LOCK(ro_fwd->ro_rt);
1811                                 ipobf.isbroadcast =
1812                                     in_broadcast(dst->sin_addr, ifp);
1813                         }
1814                         RT_UNLOCK(ro_fwd->ro_rt);
1815                         ROUTE_RELEASE(ro);
1816                         ro->ro_rt = ro_fwd->ro_rt;
1817                         ro_fwd->ro_rt = NULL;
1818                         dst = SIN(&ro_fwd->ro_dst);
1819
1820                         /*
1821                          * If we added a default src ip earlier,
1822                          * which would have been gotten from the-then
1823                          * interface, do it again, from the new one.
1824                          */
1825                         if (ia_fw != NULL) {
1826                                 if (ipobf.fwd_rewrite_src) {
1827                                         IFA_LOCK_SPIN(&ia_fw->ia_ifa);
1828                                         ip->ip_src = IA_SIN(ia_fw)->sin_addr;
1829                                         IFA_UNLOCK(&ia_fw->ia_ifa);
1830                                 }
1831                                 IFA_REMREF(&ia_fw->ia_ifa);
1832                         }
1833                         goto pass;
1834                 }
1835 #endif /* IPFIREWALL_FORWARD */
1836                 /*
1837                  * if we get here, none of the above matches, and
1838                  * we have to drop the pkt
1839                  */
1840                 m_freem(m);
1841                 error = EACCES; /* not sure this is the right error msg */
1842                 goto done;
1843         }
1844
1845 pass:
1846 #endif /* IPFIREWALL */
1847
1848         /* 127/8 must not appear on wire - RFC1122 */
1849         if (!(ifp->if_flags & IFF_LOOPBACK) &&
1850             ((ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
1851             (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)) {
1852                 OSAddAtomic(1, &ipstat.ips_badaddr);
1853                 error = EADDRNOTAVAIL;
1854                 goto bad;
1855         }
1856
1857         if (ipoa != NULL) {
1858                 u_int8_t dscp = ip->ip_tos >> IPTOS_DSCP_SHIFT;
1859
1860                 error = set_packet_qos(m, ifp,
1861                     ipoa->ipoa_flags & IPOAF_QOSMARKING_ALLOWED ? TRUE : FALSE,
1862                     ipoa->ipoa_sotc, ipoa->ipoa_netsvctype, &dscp);
1863                 if (error == 0) {
1864                         ip->ip_tos &= IPTOS_ECN_MASK;
1865                         ip->ip_tos |= dscp << IPTOS_DSCP_SHIFT;
1866                 } else {
1867                         printf("%s if_dscp_for_mbuf() error %d\n", __func__, error);
1868                         error = 0;
1869                 }
1870         }
1871
1872         ip_output_checksum(ifp, m, (IP_VHL_HL(ip->ip_vhl) << 2),
1873             ip->ip_len, &sw_csum);
1874
1875         interface_mtu = ifp->if_mtu;
1876
1877         if (INTF_ADJUST_MTU_FOR_CLAT46(ifp)) {
1878                 interface_mtu = IN6_LINKMTU(ifp);
1879                 /* Further adjust the size for CLAT46 expansion */
1880                 interface_mtu -= CLAT46_HDR_EXPANSION_OVERHD;
1881         }
1882
1883         /*
1884          * If small enough for interface, or the interface will take
1885          * care of the fragmentation for us, can just send directly.
1886          */
1887         if ((u_short)ip->ip_len <= interface_mtu || TSO_IPV4_OK(ifp, m) ||
1888             (!(ip->ip_off & IP_DF) && (ifp->if_hwassist & CSUM_FRAGMENT))) {
1889 #if BYTE_ORDER != BIG_ENDIAN
1890                 HTONS(ip->ip_len);
1891                 HTONS(ip->ip_off);
1892 #endif
1893
1894                 ip->ip_sum = 0;
1895                 if (sw_csum & CSUM_DELAY_IP) {
1896                         ip->ip_sum = ip_cksum_hdr_out(m, hlen);
1897                         sw_csum &= ~CSUM_DELAY_IP;
1898                         m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP;
1899                 }
1900
1901 #if IPSEC
1902                 /* clean ipsec history once it goes out of the node */
1903                 if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) {
1904                         ipsec_delaux(m);
1905                 }
1906 #endif /* IPSEC */
1907                 if ((m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) &&
1908                     (m->m_pkthdr.tso_segsz > 0)) {
1909                         scnt += m->m_pkthdr.len / m->m_pkthdr.tso_segsz;
1910                 } else {
1911                         scnt++;
1912                 }
1913
1914                 if (packetchain == 0) {
1915                         if (ro->ro_rt != NULL && nstat_collect) {
1916                                 nstat_route_tx(ro->ro_rt, scnt,
1917                                     m->m_pkthdr.len, 0);
1918                         }
1919
1920                         error = dlil_output(ifp, PF_INET, m, ro->ro_rt,
1921                             SA(dst), 0, adv);
1922                         if (dlil_verbose && error) {
1923                                 printf("dlil_output error on interface %s: %d\n",
1924                                     ifp->if_xname, error);
1925                         }
1926                         scnt = 0;
1927                         goto done;
1928                 } else {
1929                         /*
1930                          * packet chaining allows us to reuse the
1931                          * route for all packets
1932                          */
1933                         bytecnt += m->m_pkthdr.len;
1934                         mppn = &m->m_nextpkt;
1935                         m = m->m_nextpkt;
1936                         if (m == NULL) {
1937 #if PF
1938 sendchain:
1939 #endif /* PF */
1940                                 if (pktcnt > ip_maxchainsent) {
1941                                         ip_maxchainsent = pktcnt;
1942                                 }
1943                                 if (ro->ro_rt != NULL && nstat_collect) {
1944                                         nstat_route_tx(ro->ro_rt, scnt,
1945                                             bytecnt, 0);
1946                                 }
1947
1948                                 error = dlil_output(ifp, PF_INET, packetlist,
1949                                     ro->ro_rt, SA(dst), 0, adv);
1950                                 if (dlil_verbose && error) {
1951                                         printf("dlil_output error on interface %s: %d\n",
1952                                             ifp->if_xname, error);
1953                                 }
1954                                 pktcnt = 0;
1955                                 scnt = 0;
1956                                 bytecnt = 0;
1957                                 goto done;
1958                         }
1959                         m0 = m;
1960                         pktcnt++;
1961                         goto loopit;
1962                 }
1963         }
1964
1965         VERIFY(interface_mtu != 0);
1966         /*
1967          * Too large for interface; fragment if possible.
1968          * Must be able to put at least 8 bytes per fragment.
1969          * Balk when DF bit is set or the interface didn't support TSO.
1970          */
1971         if ((ip->ip_off & IP_DF) || pktcnt > 0 ||
1972             (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4)) {
1973                 error = EMSGSIZE;
1974                 /*
1975                  * This case can happen if the user changed the MTU
1976                  * of an interface after enabling IP on it.  Because
1977                  * most netifs don't keep track of routes pointing to
1978                  * them, there is no way for one to update all its
1979                  * routes when the MTU is changed.
1980                  */
1981                 if (ro->ro_rt) {
1982                         RT_LOCK_SPIN(ro->ro_rt);
1983                         if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) &&
1984                             !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) &&
1985                             (ro->ro_rt->rt_rmx.rmx_mtu > interface_mtu)) {
1986                                 ro->ro_rt->rt_rmx.rmx_mtu = interface_mtu;
1987                         }
1988                         RT_UNLOCK(ro->ro_rt);
1989                 }
1990                 if (pktcnt > 0) {
1991                         m0 = packetlist;
1992                 }
1993                 OSAddAtomic(1, &ipstat.ips_cantfrag);
1994                 goto bad;
1995         }
1996
1997         /*
1998          * XXX Only TCP seems to be passing a list of packets here.
1999          * The following issue is limited to UDP datagrams with 0 checksum.
2000          * For now limit it to the case when single packet is passed down.
2001          */
2002         if (packetchain == 0 && IS_INTF_CLAT46(ifp)) {
2003                 /*
2004                  * If it is a UDP packet that has checksum set to 0
2005                  * and is also not being offloaded, compute a full checksum
2006                  * and update the UDP checksum.
2007                  */
2008                 if (ip->ip_p == IPPROTO_UDP &&
2009                     !(m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_PARTIAL))) {
2010                         struct udphdr *uh = NULL;
2011
2012                         if (m->m_len < hlen + sizeof(struct udphdr)) {
2013                                 m = m_pullup(m, hlen + sizeof(struct udphdr));
2014                                 if (m == NULL) {
2015                                         error = ENOBUFS;
2016                                         m0 = m;
2017                                         goto bad;
2018                                 }
2019                                 m0 = m;
2020                                 ip = mtod(m, struct ip *);
2021                         }
2022                         /*
2023                          * Get UDP header and if checksum is 0, then compute the full
2024                          * checksum.
2025                          */
2026                         uh = (struct udphdr *)(void *)((caddr_t)ip + hlen);
2027                         if (uh->uh_sum == 0) {
2028                                 uh->uh_sum = inet_cksum(m, IPPROTO_UDP, hlen,
2029                                     ip->ip_len - hlen);
2030                                 if (uh->uh_sum == 0) {
2031                                         uh->uh_sum = 0xffff;
2032                                 }
2033                         }
2034                 }
2035         }
2036
2037         error = ip_fragment(m, ifp, interface_mtu, sw_csum);
2038         if (error != 0) {
2039                 m0 = m = NULL;
2040                 goto bad;
2041         }
2042
2043         KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr,
2044             ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len);
2045
2046         for (m = m0; m; m = m0) {
2047                 m0 = m->m_nextpkt;
2048                 m->m_nextpkt = 0;
2049 #if IPSEC
2050                 /* clean ipsec history once it goes out of the node */
2051                 if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) {
2052                         ipsec_delaux(m);
2053                 }
2054 #endif /* IPSEC */
2055                 if (error == 0) {
2056                         if ((packetchain != 0) && (pktcnt > 0)) {
2057                                 panic("%s: mix of packet in packetlist is "
2058                                     "wrong=%p", __func__, packetlist);
2059                                 /* NOTREACHED */
2060                         }
2061                         if (ro->ro_rt != NULL && nstat_collect) {
2062                                 nstat_route_tx(ro->ro_rt, 1,
2063                                     m->m_pkthdr.len, 0);
2064                         }
2065                         error = dlil_output(ifp, PF_INET, m, ro->ro_rt,
2066                             SA(dst), 0, adv);
2067                         if (dlil_verbose && error) {
2068                                 printf("dlil_output error on interface %s: %d\n",
2069                                     ifp->if_xname, error);
2070                         }
2071                 } else {
2072                         m_freem(m);
2073                 }
2074         }
2075
2076         if (error == 0) {
2077                 OSAddAtomic(1, &ipstat.ips_fragmented);
2078         }
2079
2080 done:
2081         if (ia != NULL) {
2082                 IFA_REMREF(&ia->ia_ifa);
2083                 ia = NULL;
2084         }
2085 #if IPSEC
2086         ROUTE_RELEASE(&ipsec_state.ro);
2087         if (sp != NULL) {
2088                 KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
2089                     printf("DP ip_output call free SP:%x\n", sp));
2090                 key_freesp(sp, KEY_SADB_UNLOCKED);
2091         }
2092 #endif /* IPSEC */
2093 #if NECP
2094         ROUTE_RELEASE(&necp_route);
2095 #endif /* NECP */
2096 #if DUMMYNET
2097         ROUTE_RELEASE(&saved_route);
2098 #endif /* DUMMYNET */
2099 #if IPFIREWALL_FORWARD
2100         ROUTE_RELEASE(&sro_fwd);
2101 #endif /* IPFIREWALL_FORWARD */
2102
2103         KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_END, error, 0, 0, 0, 0);
2104         if (ip_output_measure) {
2105                 net_perf_measure_time(&net_perf, &start_tv, packets_processed);
2106                 net_perf_histogram(&net_perf, packets_processed);
2107         }
2108         return error;
2109 bad:
2110         if (pktcnt > 0) {
2111                 m0 = packetlist;
2112         }
2113         m_freem_list(m0);
2114         goto done;
2115
2116 #undef ipsec_state
2117 #undef args
2118 #undef sro_fwd
2119 #undef saved_route
2120 #undef ipf_pktopts
2121 #undef IP_CHECK_RESTRICTIONS
2122 }
2123
2124 int
2125 ip_fragment(struct mbuf *m, struct ifnet *ifp, unsigned long mtu, int sw_csum)
2126 {
2127         struct ip *ip, *mhip;
2128         int len, hlen, mhlen, firstlen, off, error = 0;
2129         struct mbuf **mnext = &m->m_nextpkt, *m0;
2130         int nfrags = 1;
2131
2132         ip = mtod(m, struct ip *);
2133 #ifdef _IP_VHL
2134         hlen = IP_VHL_HL(ip->ip_vhl) << 2;
2135 #else /* !_IP_VHL */
2136         hlen = ip->ip_hl << 2;
2137 #endif /* !_IP_VHL */
2138
2139 #ifdef INET6
2140         /*
2141          * We need to adjust the fragment sizes to account
2142          * for IPv6 fragment header if it needs to be translated
2143          * from IPv4 to IPv6.
2144          */
2145         if (IS_INTF_CLAT46(ifp)) {
2146                 mtu -= sizeof(struct ip6_frag);
2147         }
2148
2149 #endif
2150         firstlen = len = (mtu - hlen) & ~7;
2151         if (len < 8) {
2152                 m_freem(m);
2153                 return EMSGSIZE;
2154         }
2155
2156         /*
2157          * if the interface will not calculate checksums on
2158          * fragmented packets, then do it here.
2159          */
2160         if ((m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) &&
2161             !(ifp->if_hwassist & CSUM_IP_FRAGS)) {
2162                 in_delayed_cksum(m);
2163         }
2164
2165         /*
2166          * Loop through length of segment after first fragment,
2167          * make new header and copy data of each part and link onto chain.
2168          */
2169         m0 = m;
2170         mhlen = sizeof(struct ip);
2171         for (off = hlen + len; off < (u_short)ip->ip_len; off += len) {
2172                 MGETHDR(m, M_DONTWAIT, MT_HEADER);      /* MAC-OK */
2173                 if (m == NULL) {
2174                         error = ENOBUFS;
2175                         OSAddAtomic(1, &ipstat.ips_odropped);
2176                         goto sendorfree;
2177                 }
2178                 m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
2179                 m->m_data += max_linkhdr;
2180                 mhip = mtod(m, struct ip *);
2181                 *mhip = *ip;
2182                 if (hlen > sizeof(struct ip)) {
2183                         mhlen = ip_optcopy(ip, mhip) + sizeof(struct ip);
2184                         mhip->ip_vhl = IP_MAKE_VHL(IPVERSION, mhlen >> 2);
2185                 }
2186                 m->m_len = mhlen;
2187                 mhip->ip_off = ((off - hlen) >> 3) + (ip->ip_off & ~IP_MF);
2188                 if (ip->ip_off & IP_MF) {
2189                         mhip->ip_off |= IP_MF;
2190                 }
2191                 if (off + len >= (u_short)ip->ip_len) {
2192                         len = (u_short)ip->ip_len - off;
2193                 } else {
2194                         mhip->ip_off |= IP_MF;
2195                 }
2196                 mhip->ip_len = htons((u_short)(len + mhlen));
2197                 m->m_next = m_copy(m0, off, len);
2198                 if (m->m_next == NULL) {
2199                         (void) m_free(m);
2200                         error = ENOBUFS;        /* ??? */
2201                         OSAddAtomic(1, &ipstat.ips_odropped);
2202                         goto sendorfree;
2203                 }
2204                 m->m_pkthdr.len = mhlen + len;
2205                 m->m_pkthdr.rcvif = NULL;
2206                 m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
2207
2208                 M_COPY_CLASSIFIER(m, m0);
2209                 M_COPY_PFTAG(m, m0);
2210
2211 #if CONFIG_MACF_NET
2212                 mac_netinet_fragment(m0, m);
2213 #endif /* CONFIG_MACF_NET */
2214
2215 #if BYTE_ORDER != BIG_ENDIAN
2216                 HTONS(mhip->ip_off);
2217 #endif
2218
2219                 mhip->ip_sum = 0;
2220                 if (sw_csum & CSUM_DELAY_IP) {
2221                         mhip->ip_sum = ip_cksum_hdr_out(m, mhlen);
2222                         m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP;
2223                 }
2224                 *mnext = m;
2225                 mnext = &m->m_nextpkt;
2226                 nfrags++;
2227         }
2228         OSAddAtomic(nfrags, &ipstat.ips_ofragments);
2229
2230         /* set first/last markers for fragment chain */
2231         m->m_flags |= M_LASTFRAG;
2232         m0->m_flags |= M_FIRSTFRAG | M_FRAG;
2233         m0->m_pkthdr.csum_data = nfrags;
2234
2235         /*
2236          * Update first fragment by trimming what's been copied out
2237          * and updating header, then send each fragment (in order).
2238          */
2239         m = m0;
2240         m_adj(m, hlen + firstlen - (u_short)ip->ip_len);
2241         m->m_pkthdr.len = hlen + firstlen;
2242         ip->ip_len = htons((u_short)m->m_pkthdr.len);
2243         ip->ip_off |= IP_MF;
2244
2245 #if BYTE_ORDER != BIG_ENDIAN
2246         HTONS(ip->ip_off);
2247 #endif
2248
2249         ip->ip_sum = 0;
2250         if (sw_csum & CSUM_DELAY_IP) {
2251                 ip->ip_sum = ip_cksum_hdr_out(m, hlen);
2252                 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP;
2253         }
2254 sendorfree:
2255         if (error) {
2256                 m_freem_list(m0);
2257         }
2258
2259         return error;
2260 }
2261
2262 static void
2263 ip_out_cksum_stats(int proto, u_int32_t len)
2264 {
2265         switch (proto) {
2266         case IPPROTO_TCP:
2267                 tcp_out_cksum_stats(len);
2268                 break;
2269         case IPPROTO_UDP:
2270                 udp_out_cksum_stats(len);
2271                 break;
2272         default:
2273                 /* keep only TCP or UDP stats for now */
2274                 break;
2275         }
2276 }
2277
2278 /*
2279  * Process a delayed payload checksum calculation (outbound path.)
2280  *
2281  * hoff is the number of bytes beyond the mbuf data pointer which
2282  * points to the IP header.
2283  *
2284  * Returns a bitmask representing all the work done in software.
2285  */
2286 uint32_t
2287 in_finalize_cksum(struct mbuf *m, uint32_t hoff, uint32_t csum_flags)
2288 {
2289         unsigned char buf[15 << 2] __attribute__((aligned(8)));
2290         struct ip *ip;
2291         uint32_t offset, _hlen, mlen, hlen, len, sw_csum;
2292         uint16_t csum, ip_len;
2293
2294         _CASSERT(sizeof(csum) == sizeof(uint16_t));
2295         VERIFY(m->m_flags & M_PKTHDR);
2296
2297         sw_csum = (csum_flags & m->m_pkthdr.csum_flags);
2298
2299         if ((sw_csum &= (CSUM_DELAY_IP | CSUM_DELAY_DATA)) == 0) {
2300                 goto done;
2301         }
2302
2303         mlen = m->m_pkthdr.len;                         /* total mbuf len */
2304
2305         /* sanity check (need at least simple IP header) */
2306         if (mlen < (hoff + sizeof(*ip))) {
2307                 panic("%s: mbuf %p pkt len (%u) < hoff+ip_hdr "
2308                     "(%u+%u)\n", __func__, m, mlen, hoff,
2309                     (uint32_t)sizeof(*ip));
2310                 /* NOTREACHED */
2311         }
2312
2313         /*
2314          * In case the IP header is not contiguous, or not 32-bit aligned,
2315          * or if we're computing the IP header checksum, copy it to a local
2316          * buffer.  Copy only the simple IP header here (IP options case
2317          * is handled below.)
2318          */
2319         if ((sw_csum & CSUM_DELAY_IP) || (hoff + sizeof(*ip)) > m->m_len ||
2320             !IP_HDR_ALIGNED_P(mtod(m, caddr_t) + hoff)) {
2321                 m_copydata(m, hoff, sizeof(*ip), (caddr_t)buf);
2322                 ip = (struct ip *)(void *)buf;
2323                 _hlen = sizeof(*ip);
2324         } else {
2325                 ip = (struct ip *)(void *)(m->m_data + hoff);
2326                 _hlen = 0;
2327         }
2328
2329         hlen = IP_VHL_HL(ip->ip_vhl) << 2;              /* IP header len */
2330
2331         /* sanity check */
2332         if (mlen < (hoff + hlen)) {
2333                 panic("%s: mbuf %p pkt too short (%d) for IP header (%u), "
2334                     "hoff %u", __func__, m, mlen, hlen, hoff);
2335                 /* NOTREACHED */
2336         }
2337
2338         /*
2339          * We could be in the context of an IP or interface filter; in the
2340          * former case, ip_len would be in host (correct) order while for
2341          * the latter it would be in network order.  Because of this, we
2342          * attempt to interpret the length field by comparing it against
2343          * the actual packet length.  If the comparison fails, byte swap
2344          * the length and check again.  If it still fails, use the actual
2345          * packet length.  This also covers the trailing bytes case.
2346          */
2347         ip_len = ip->ip_len;
2348         if (ip_len != (mlen - hoff)) {
2349                 ip_len = OSSwapInt16(ip_len);
2350                 if (ip_len != (mlen - hoff)) {
2351                         printf("%s: mbuf 0x%llx proto %d IP len %d (%x) "
2352                             "[swapped %d (%x)] doesn't match actual packet "
2353                             "length; %d is used instead\n", __func__,
2354                             (uint64_t)VM_KERNEL_ADDRPERM(m), ip->ip_p,
2355                             ip->ip_len, ip->ip_len, ip_len, ip_len,
2356                             (mlen - hoff));
2357                         ip_len = mlen - hoff;
2358                 }
2359         }
2360
2361         len = ip_len - hlen;                            /* csum span */
2362
2363         if (sw_csum & CSUM_DELAY_DATA) {
2364                 uint16_t ulpoff;
2365
2366                 /*
2367                  * offset is added to the lower 16-bit value of csum_data,
2368                  * which is expected to contain the ULP offset; therefore
2369                  * CSUM_PARTIAL offset adjustment must be undone.
2370                  */
2371                 if ((m->m_pkthdr.csum_flags & (CSUM_PARTIAL | CSUM_DATA_VALID)) ==
2372                     (CSUM_PARTIAL | CSUM_DATA_VALID)) {
2373                         /*
2374                          * Get back the original ULP offset (this will
2375                          * undo the CSUM_PARTIAL logic in ip_output.)
2376                          */
2377                         m->m_pkthdr.csum_data = (m->m_pkthdr.csum_tx_stuff -
2378                             m->m_pkthdr.csum_tx_start);
2379                 }
2380
2381                 ulpoff = (m->m_pkthdr.csum_data & 0xffff); /* ULP csum offset */
2382                 offset = hoff + hlen;                   /* ULP header */
2383
2384                 if (mlen < (ulpoff + sizeof(csum))) {
2385                         panic("%s: mbuf %p pkt len (%u) proto %d invalid ULP "
2386                             "cksum offset (%u) cksum flags 0x%x\n", __func__,
2387                             m, mlen, ip->ip_p, ulpoff, m->m_pkthdr.csum_flags);
2388                         /* NOTREACHED */
2389                 }
2390
2391                 csum = inet_cksum(m, 0, offset, len);
2392
2393                 /* Update stats */
2394                 ip_out_cksum_stats(ip->ip_p, len);
2395
2396                 /* RFC1122 4.1.3.4 */
2397                 if (csum == 0 &&
2398                     (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_ZERO_INVERT))) {
2399                         csum = 0xffff;
2400                 }
2401
2402                 /* Insert the checksum in the ULP csum field */
2403                 offset += ulpoff;
2404                 if (offset + sizeof(csum) > m->m_len) {
2405                         m_copyback(m, offset, sizeof(csum), &csum);
2406                 } else if (IP_HDR_ALIGNED_P(mtod(m, char *) + hoff)) {
2407                         *(uint16_t *)(void *)(mtod(m, char *) + offset) = csum;
2408                 } else {
2409                         bcopy(&csum, (mtod(m, char *) + offset), sizeof(csum));
2410                 }
2411                 m->m_pkthdr.csum_flags &= ~(CSUM_DELAY_DATA | CSUM_DATA_VALID |
2412                     CSUM_PARTIAL | CSUM_ZERO_INVERT);
2413         }
2414
2415         if (sw_csum & CSUM_DELAY_IP) {
2416                 /* IP header must be in the local buffer */
2417                 VERIFY(_hlen == sizeof(*ip));
2418                 if (_hlen != hlen) {
2419                         VERIFY(hlen <= sizeof(buf));
2420                         m_copydata(m, hoff, hlen, (caddr_t)buf);
2421                         ip = (struct ip *)(void *)buf;
2422                         _hlen = hlen;
2423                 }
2424
2425                 /*
2426                  * Compute the IP header checksum as if the IP length
2427                  * is the length which we believe is "correct"; see
2428                  * how ip_len gets calculated above.  Note that this
2429                  * is done on the local copy and not on the real one.
2430                  */
2431                 ip->ip_len = htons(ip_len);
2432                 ip->ip_sum = 0;
2433                 csum = in_cksum_hdr_opt(ip);
2434
2435                 /* Update stats */
2436                 ipstat.ips_snd_swcsum++;
2437                 ipstat.ips_snd_swcsum_bytes += hlen;
2438
2439                 /*
2440                  * Insert only the checksum in the existing IP header
2441                  * csum field; all other fields are left unchanged.
2442                  */
2443                 offset = hoff + offsetof(struct ip, ip_sum);
2444                 if (offset + sizeof(csum) > m->m_len) {
2445                         m_copyback(m, offset, sizeof(csum), &csum);
2446                 } else if (IP_HDR_ALIGNED_P(mtod(m, char *) + hoff)) {
2447                         *(uint16_t *)(void *)(mtod(m, char *) + offset) = csum;
2448                 } else {
2449                         bcopy(&csum, (mtod(m, char *) + offset), sizeof(csum));
2450                 }
2451                 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP;
2452         }
2453
2454 done:
2455         return sw_csum;
2456 }
2457
2458 /*
2459  * Insert IP options into preformed packet.
2460  * Adjust IP destination as required for IP source routing,
2461  * as indicated by a non-zero in_addr at the start of the options.
2462  *
2463  * XXX This routine assumes that the packet has no options in place.
2464  */
2465 static struct mbuf *
2466 ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen)
2467 {
2468         struct ipoption *p = mtod(opt, struct ipoption *);
2469         struct mbuf *n;
2470         struct ip *ip = mtod(m, struct ip *);
2471         unsigned optlen;
2472
2473         optlen = opt->m_len - sizeof(p->ipopt_dst);
2474         if (optlen + (u_short)ip->ip_len > IP_MAXPACKET) {
2475                 return m;             /* XXX should fail */
2476         }
2477         if (p->ipopt_dst.s_addr) {
2478                 ip->ip_dst = p->ipopt_dst;
2479         }
2480         if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) {
2481                 MGETHDR(n, M_DONTWAIT, MT_HEADER);      /* MAC-OK */
2482                 if (n == NULL) {
2483                         return m;
2484                 }
2485                 n->m_pkthdr.rcvif = 0;
2486 #if CONFIG_MACF_NET
2487                 mac_mbuf_label_copy(m, n);
2488 #endif /* CONFIG_MACF_NET */
2489                 n->m_pkthdr.len = m->m_pkthdr.len + optlen;
2490                 m->m_len -= sizeof(struct ip);
2491                 m->m_data += sizeof(struct ip);
2492                 n->m_next = m;
2493                 m = n;
2494                 m->m_len = optlen + sizeof(struct ip);
2495                 m->m_data += max_linkhdr;
2496                 (void) memcpy(mtod(m, void *), ip, sizeof(struct ip));
2497         } else {
2498                 m->m_data -= optlen;
2499                 m->m_len += optlen;
2500                 m->m_pkthdr.len += optlen;
2501                 ovbcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
2502         }
2503         ip = mtod(m, struct ip *);
2504         bcopy(p->ipopt_list, ip + 1, optlen);
2505         *phlen = sizeof(struct ip) + optlen;
2506         ip->ip_vhl = IP_MAKE_VHL(IPVERSION, *phlen >> 2);
2507         ip->ip_len += optlen;
2508         return m;
2509 }
2510
2511 /*
2512  * Copy options from ip to jp,
2513  * omitting those not copied during fragmentation.
2514  */
2515 static int
2516 ip_optcopy(struct ip *ip, struct ip *jp)
2517 {
2518         u_char *cp, *dp;
2519         int opt, optlen, cnt;
2520
2521         cp = (u_char *)(ip + 1);
2522         dp = (u_char *)(jp + 1);
2523         cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof(struct ip);
2524         for (; cnt > 0; cnt -= optlen, cp += optlen) {
2525                 opt = cp[0];
2526                 if (opt == IPOPT_EOL) {
2527                         break;
2528                 }
2529                 if (opt == IPOPT_NOP) {
2530                         /* Preserve for IP mcast tunnel's LSRR alignment. */
2531                         *dp++ = IPOPT_NOP;
2532                         optlen = 1;
2533                         continue;
2534                 }
2535 #if DIAGNOSTIC
2536                 if (cnt < IPOPT_OLEN + sizeof(*cp)) {
2537                         panic("malformed IPv4 option passed to ip_optcopy");
2538                         /* NOTREACHED */
2539                 }
2540 #endif
2541                 optlen = cp[IPOPT_OLEN];
2542 #if DIAGNOSTIC
2543                 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) {
2544                         panic("malformed IPv4 option passed to ip_optcopy");
2545                         /* NOTREACHED */
2546                 }
2547 #endif
2548                 /* bogus lengths should have been caught by ip_dooptions */
2549                 if (optlen > cnt) {
2550                         optlen = cnt;
2551                 }
2552                 if (IPOPT_COPIED(opt)) {
2553                         bcopy(cp, dp, optlen);
2554                         dp += optlen;
2555                 }
2556         }
2557         for (optlen = dp - (u_char *)(jp + 1); optlen & 0x3; optlen++) {
2558                 *dp++ = IPOPT_EOL;
2559         }
2560         return optlen;
2561 }
2562
2563 /*
2564  * IP socket option processing.
2565  */
2566 int
2567 ip_ctloutput(struct socket *so, struct sockopt *sopt)
2568 {
2569         struct  inpcb *inp = sotoinpcb(so);
2570         int     error, optval;
2571         lck_mtx_t *mutex_held = NULL;
2572
2573         error = optval = 0;
2574         if (sopt->sopt_level != IPPROTO_IP) {
2575                 return EINVAL;
2576         }
2577
2578         switch (sopt->sopt_dir) {
2579         case SOPT_SET:
2580                 mutex_held = socket_getlock(so, PR_F_WILLUNLOCK);
2581                 /*
2582                  *  Wait if we are in the middle of ip_output
2583                  *  as we unlocked the socket there and don't
2584                  *  want to overwrite the IP options
2585                  */
2586                 if (inp->inp_sndinprog_cnt > 0) {
2587                         inp->inp_sndingprog_waiters++;
2588
2589                         while (inp->inp_sndinprog_cnt > 0) {
2590                                 msleep(&inp->inp_sndinprog_cnt, mutex_held,
2591                                     PSOCK | PCATCH, "inp_sndinprog_cnt", NULL);
2592                         }
2593                         inp->inp_sndingprog_waiters--;
2594                 }
2595                 switch (sopt->sopt_name) {
2596 #ifdef notyet
2597                 case IP_RETOPTS:
2598 #endif
2599                 case IP_OPTIONS: {
2600                         struct mbuf *m;
2601
2602                         if (sopt->sopt_valsize > MLEN) {
2603                                 error = EMSGSIZE;
2604                                 break;
2605                         }
2606                         MGET(m, sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT,
2607                             MT_HEADER);
2608                         if (m == NULL) {
2609                                 error = ENOBUFS;
2610                                 break;
2611                         }
2612                         m->m_len = sopt->sopt_valsize;
2613                         error = sooptcopyin(sopt, mtod(m, char *),
2614                             m->m_len, m->m_len);
2615                         if (error) {
2616                                 m_freem(m);
2617                                 break;
2618                         }
2619
2620                         return ip_pcbopts(sopt->sopt_name,
2621                                    &inp->inp_options, m);
2622                 }
2623
2624                 case IP_TOS:
2625                 case IP_TTL:
2626                 case IP_RECVOPTS:
2627                 case IP_RECVRETOPTS:
2628                 case IP_RECVDSTADDR:
2629                 case IP_RECVIF:
2630                 case IP_RECVTTL:
2631                 case IP_RECVPKTINFO:
2632                 case IP_RECVTOS:
2633                         error = sooptcopyin(sopt, &optval, sizeof(optval),
2634                             sizeof(optval));
2635                         if (error) {
2636                                 break;
2637                         }
2638
2639                         switch (sopt->sopt_name) {
2640                         case IP_TOS:
2641                                 inp->inp_ip_tos = optval;
2642                                 break;
2643
2644                         case IP_TTL:
2645                                 inp->inp_ip_ttl = optval;
2646                                 break;
2647 #define OPTSET(bit) \
2648         if (optval) \
2649                 inp->inp_flags |= bit; \
2650         else \
2651                 inp->inp_flags &= ~bit;
2652
2653                         case IP_RECVOPTS:
2654                                 OPTSET(INP_RECVOPTS);
2655                                 break;
2656
2657                         case IP_RECVRETOPTS:
2658                                 OPTSET(INP_RECVRETOPTS);
2659                                 break;
2660
2661                         case IP_RECVDSTADDR:
2662                                 OPTSET(INP_RECVDSTADDR);
2663                                 break;
2664
2665                         case IP_RECVIF:
2666                                 OPTSET(INP_RECVIF);
2667                                 break;
2668
2669                         case IP_RECVTTL:
2670                                 OPTSET(INP_RECVTTL);
2671                                 break;
2672
2673                         case IP_RECVPKTINFO:
2674                                 OPTSET(INP_PKTINFO);
2675                                 break;
2676
2677                         case IP_RECVTOS:
2678                                 OPTSET(INP_RECVTOS);
2679                                 break;
2680  #undef OPTSET
2681                         }
2682                         break;
2683                 /*
2684                  * Multicast socket options are processed by the in_mcast
2685                  * module.
2686                  */
2687                 case IP_MULTICAST_IF:
2688                 case IP_MULTICAST_IFINDEX:
2689                 case IP_MULTICAST_VIF:
2690                 case IP_MULTICAST_TTL:
2691                 case IP_MULTICAST_LOOP:
2692                 case IP_ADD_MEMBERSHIP:
2693                 case IP_DROP_MEMBERSHIP:
2694                 case IP_ADD_SOURCE_MEMBERSHIP:
2695                 case IP_DROP_SOURCE_MEMBERSHIP:
2696                 case IP_BLOCK_SOURCE:
2697                 case IP_UNBLOCK_SOURCE:
2698                 case IP_MSFILTER:
2699                 case MCAST_JOIN_GROUP:
2700                 case MCAST_LEAVE_GROUP:
2701                 case MCAST_JOIN_SOURCE_GROUP:
2702                 case MCAST_LEAVE_SOURCE_GROUP:
2703                 case MCAST_BLOCK_SOURCE:
2704                 case MCAST_UNBLOCK_SOURCE:
2705                         error = inp_setmoptions(inp, sopt);
2706                         break;
2707
2708                 case IP_PORTRANGE:
2709                         error = sooptcopyin(sopt, &optval, sizeof(optval),
2710                             sizeof(optval));
2711                         if (error) {
2712                                 break;
2713                         }
2714
2715                         switch (optval) {
2716                         case IP_PORTRANGE_DEFAULT:
2717                                 inp->inp_flags &= ~(INP_LOWPORT);
2718                                 inp->inp_flags &= ~(INP_HIGHPORT);
2719                                 break;
2720
2721                         case IP_PORTRANGE_HIGH:
2722                                 inp->inp_flags &= ~(INP_LOWPORT);
2723                                 inp->inp_flags |= INP_HIGHPORT;
2724                                 break;
2725
2726                         case IP_PORTRANGE_LOW:
2727                                 inp->inp_flags &= ~(INP_HIGHPORT);
2728                                 inp->inp_flags |= INP_LOWPORT;
2729                                 break;
2730
2731                         default:
2732                                 error = EINVAL;
2733                                 break;
2734                         }
2735                         break;
2736
2737 #if IPSEC
2738                 case IP_IPSEC_POLICY: {
2739                         caddr_t req = NULL;
2740                         size_t len = 0;
2741                         int priv;
2742                         struct mbuf *m;
2743                         int optname;
2744
2745                         if ((error = soopt_getm(sopt, &m)) != 0) { /* XXX */
2746                                 break;
2747                         }
2748                         if ((error = soopt_mcopyin(sopt, m)) != 0) { /* XXX */
2749                                 break;
2750                         }
2751                         priv = (proc_suser(sopt->sopt_p) == 0);
2752                         if (m) {
2753                                 req = mtod(m, caddr_t);
2754                                 len = m->m_len;
2755                         }
2756                         optname = sopt->sopt_name;
2757                         error = ipsec4_set_policy(inp, optname, req, len, priv);
2758                         m_freem(m);
2759                         break;
2760                 }
2761 #endif /* IPSEC */
2762
2763 #if TRAFFIC_MGT
2764                 case IP_TRAFFIC_MGT_BACKGROUND: {
2765                         unsigned background = 0;
2766
2767                         error = sooptcopyin(sopt, &background,
2768                             sizeof(background), sizeof(background));
2769                         if (error) {
2770                                 break;
2771                         }
2772
2773                         if (background) {
2774                                 socket_set_traffic_mgt_flags_locked(so,
2775                                     TRAFFIC_MGT_SO_BACKGROUND);
2776                         } else {
2777                                 socket_clear_traffic_mgt_flags_locked(so,
2778                                     TRAFFIC_MGT_SO_BACKGROUND);
2779                         }
2780
2781                         break;
2782                 }
2783 #endif /* TRAFFIC_MGT */
2784
2785                 /*
2786                  * On a multihomed system, scoped routing can be used to
2787                  * restrict the source interface used for sending packets.
2788                  * The socket option IP_BOUND_IF binds a particular AF_INET
2789                  * socket to an interface such that data sent on the socket
2790                  * is restricted to that interface.  This is unlike the
2791                  * SO_DONTROUTE option where the routing table is bypassed;
2792                  * therefore it allows for a greater flexibility and control
2793                  * over the system behavior, and does not place any restriction
2794                  * on the destination address type (e.g.  unicast, multicast,
2795                  * or broadcast if applicable) or whether or not the host is
2796                  * directly reachable.  Note that in the multicast transmit
2797                  * case, IP_MULTICAST_{IF,IFINDEX} takes precedence over
2798                  * IP_BOUND_IF, since the former practically bypasses the
2799                  * routing table; in this case, IP_BOUND_IF sets the default
2800                  * interface used for sending multicast packets in the absence
2801                  * of an explicit multicast transmit interface.
2802                  */
2803                 case IP_BOUND_IF:
2804                         /* This option is settable only for IPv4 */
2805                         if (!(inp->inp_vflag & INP_IPV4)) {
2806                                 error = EINVAL;
2807                                 break;
2808                         }
2809
2810                         error = sooptcopyin(sopt, &optval, sizeof(optval),
2811                             sizeof(optval));
2812
2813                         if (error) {
2814                                 break;
2815                         }
2816
2817                         error = inp_bindif(inp, optval, NULL);
2818                         break;
2819
2820                 case IP_NO_IFT_CELLULAR:
2821                         /* This option is settable only for IPv4 */
2822                         if (!(inp->inp_vflag & INP_IPV4)) {
2823                                 error = EINVAL;
2824                                 break;
2825                         }
2826
2827                         error = sooptcopyin(sopt, &optval, sizeof(optval),
2828                             sizeof(optval));
2829
2830                         if (error) {
2831                                 break;
2832                         }
2833
2834                         /* once set, it cannot be unset */
2835                         if (!optval && INP_NO_CELLULAR(inp)) {
2836                                 error = EINVAL;
2837                                 break;
2838                         }
2839
2840                         error = so_set_restrictions(so,
2841                             SO_RESTRICT_DENY_CELLULAR);
2842                         break;
2843
2844                 case IP_OUT_IF:
2845                         /* This option is not settable */
2846                         error = EINVAL;
2847                         break;
2848
2849                 default:
2850                         error = ENOPROTOOPT;
2851                         break;
2852                 }
2853                 break;
2854
2855         case SOPT_GET:
2856                 switch (sopt->sopt_name) {
2857                 case IP_OPTIONS:
2858                 case IP_RETOPTS:
2859                         if (inp->inp_options) {
2860                                 error = sooptcopyout(sopt,
2861                                     mtod(inp->inp_options, char *),
2862                                     inp->inp_options->m_len);
2863                         } else {
2864                                 sopt->sopt_valsize = 0;
2865                         }
2866                         break;
2867
2868                 case IP_TOS:
2869                 case IP_TTL:
2870                 case IP_RECVOPTS:
2871                 case IP_RECVRETOPTS:
2872                 case IP_RECVDSTADDR:
2873                 case IP_RECVIF:
2874                 case IP_RECVTTL:
2875                 case IP_PORTRANGE:
2876                 case IP_RECVPKTINFO:
2877                 case IP_RECVTOS:
2878                         switch (sopt->sopt_name) {
2879                         case IP_TOS:
2880                                 optval = inp->inp_ip_tos;
2881                                 break;
2882
2883                         case IP_TTL:
2884                                 optval = inp->inp_ip_ttl;
2885                                 break;
2886
2887 #define OPTBIT(bit)     (inp->inp_flags & bit ? 1 : 0)
2888
2889                         case IP_RECVOPTS:
2890                                 optval = OPTBIT(INP_RECVOPTS);
2891                                 break;
2892
2893                         case IP_RECVRETOPTS:
2894                                 optval = OPTBIT(INP_RECVRETOPTS);
2895                                 break;
2896
2897                         case IP_RECVDSTADDR:
2898                                 optval = OPTBIT(INP_RECVDSTADDR);
2899                                 break;
2900
2901                         case IP_RECVIF:
2902                                 optval = OPTBIT(INP_RECVIF);
2903                                 break;
2904
2905                         case IP_RECVTTL:
2906                                 optval = OPTBIT(INP_RECVTTL);
2907                                 break;
2908
2909                         case IP_PORTRANGE:
2910                                 if (inp->inp_flags & INP_HIGHPORT) {
2911                                         optval = IP_PORTRANGE_HIGH;
2912                                 } else if (inp->inp_flags & INP_LOWPORT) {
2913                                         optval = IP_PORTRANGE_LOW;
2914                                 } else {
2915                                         optval = 0;
2916                                 }
2917                                 break;
2918
2919                         case IP_RECVPKTINFO:
2920                                 optval = OPTBIT(INP_PKTINFO);
2921                                 break;
2922
2923                         case IP_RECVTOS:
2924                                 optval = OPTBIT(INP_RECVTOS);
2925                                 break;
2926                         }
2927                         error = sooptcopyout(sopt, &optval, sizeof(optval));
2928                         break;
2929
2930                 case IP_MULTICAST_IF:
2931                 case IP_MULTICAST_IFINDEX:
2932                 case IP_MULTICAST_VIF:
2933                 case IP_MULTICAST_TTL:
2934                 case IP_MULTICAST_LOOP:
2935                 case IP_MSFILTER:
2936                         error = inp_getmoptions(inp, sopt);
2937                         break;
2938
2939 #if IPSEC
2940                 case IP_IPSEC_POLICY: {
2941                         error = 0; /* This option is no longer supported */
2942                         break;
2943                 }
2944 #endif /* IPSEC */
2945
2946 #if TRAFFIC_MGT
2947                 case IP_TRAFFIC_MGT_BACKGROUND: {
2948                         unsigned background = (so->so_flags1 &
2949                             SOF1_TRAFFIC_MGT_SO_BACKGROUND) ? 1 : 0;
2950                         return sooptcopyout(sopt, &background,
2951                                    sizeof(background));
2952                 }
2953 #endif /* TRAFFIC_MGT */
2954
2955                 case IP_BOUND_IF:
2956                         if (inp->inp_flags & INP_BOUND_IF) {
2957                                 optval = inp->inp_boundifp->if_index;
2958                         }
2959                         error = sooptcopyout(sopt, &optval, sizeof(optval));
2960                         break;
2961
2962                 case IP_NO_IFT_CELLULAR:
2963                         optval = INP_NO_CELLULAR(inp) ? 1 : 0;
2964                         error = sooptcopyout(sopt, &optval, sizeof(optval));
2965                         break;
2966
2967                 case IP_OUT_IF:
2968                         optval = (inp->inp_last_outifp != NULL) ?
2969                             inp->inp_last_outifp->if_index : 0;
2970                         error = sooptcopyout(sopt, &optval, sizeof(optval));
2971                         break;
2972
2973                 default:
2974                         error = ENOPROTOOPT;
2975                         break;
2976                 }
2977                 break;
2978         }
2979         return error;
2980 }
2981
2982 /*
2983  * Set up IP options in pcb for insertion in output packets.
2984  * Store in mbuf with pointer in pcbopt, adding pseudo-option
2985  * with destination address if source routed.
2986  */
2987 static int
2988 ip_pcbopts(int optname, struct mbuf **pcbopt, struct mbuf *m)
2989 {
2990 #pragma unused(optname)
2991         int cnt, optlen;
2992         u_char *cp;
2993         u_char opt;
2994
2995         /* turn off any old options */
2996         if (*pcbopt) {
2997                 (void) m_free(*pcbopt);
2998         }
2999         *pcbopt = 0;
3000         if (m == (struct mbuf *)0 || m->m_len == 0) {
3001                 /*
3002                  * Only turning off any previous options.
3003                  */
3004                 if (m) {
3005                         (void) m_free(m);
3006                 }
3007                 return 0;
3008         }
3009
3010         if (m->m_len % sizeof(int32_t)) {
3011                 goto bad;
3012         }
3013
3014         /*
3015          * IP first-hop destination address will be stored before
3016          * actual options; move other options back
3017          * and clear it when none present.
3018          */
3019         if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN]) {
3020                 goto bad;
3021         }
3022         cnt = m->m_len;
3023         m->m_len += sizeof(struct in_addr);
3024         cp = mtod(m, u_char *) + sizeof(struct in_addr);
3025         ovbcopy(mtod(m, caddr_t), (caddr_t)cp, (unsigned)cnt);
3026         bzero(mtod(m, caddr_t), sizeof(struct in_addr));
3027
3028         for (; cnt > 0; cnt -= optlen, cp += optlen) {
3029                 opt = cp[IPOPT_OPTVAL];
3030                 if (opt == IPOPT_EOL) {
3031                         break;
3032                 }
3033                 if (opt == IPOPT_NOP) {
3034                         optlen = 1;
3035                 } else {
3036                         if (cnt < IPOPT_OLEN + sizeof(*cp)) {
3037                                 goto bad;
3038                         }
3039                         optlen = cp[IPOPT_OLEN];
3040                         if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) {
3041                                 goto bad;
3042                         }
3043                 }
3044                 switch (opt) {
3045                 default:
3046                         break;
3047
3048                 case IPOPT_LSRR:
3049                 case IPOPT_SSRR:
3050                         /*
3051                          * user process specifies route as:
3052                          *      ->A->B->C->D
3053                          * D must be our final destination (but we can't
3054                          * check that since we may not have connected yet).
3055                          * A is first hop destination, which doesn't appear in
3056                          * actual IP option, but is stored before the options.
3057                          */
3058                         if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr)) {
3059                                 goto bad;
3060                         }
3061                         m->m_len -= sizeof(struct in_addr);
3062                         cnt -= sizeof(struct in_addr);
3063                         optlen -= sizeof(struct in_addr);
3064                         cp[IPOPT_OLEN] = optlen;
3065                         /*
3066                          * Move first hop before start of options.
3067                          */
3068                         bcopy((caddr_t)&cp[IPOPT_OFFSET + 1], mtod(m, caddr_t),
3069                             sizeof(struct in_addr));
3070                         /*
3071                          * Then copy rest of options back
3072                          * to close up the deleted entry.
3073                          */
3074                         ovbcopy((caddr_t)(&cp[IPOPT_OFFSET + 1] +
3075                             sizeof(struct in_addr)),
3076                             (caddr_t)&cp[IPOPT_OFFSET + 1],
3077                             (unsigned)cnt - (IPOPT_MINOFF - 1));
3078                         break;
3079                 }
3080         }
3081         if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr)) {
3082                 goto bad;
3083         }
3084         *pcbopt = m;
3085         return 0;
3086
3087 bad:
3088         (void) m_free(m);
3089         return EINVAL;
3090 }
3091
3092 void
3093 ip_moptions_init(void)
3094 {
3095         PE_parse_boot_argn("ifa_debug", &imo_debug, sizeof(imo_debug));
3096
3097         imo_size = (imo_debug == 0) ? sizeof(struct ip_moptions) :
3098             sizeof(struct ip_moptions_dbg);
3099
3100         imo_zone = zinit(imo_size, IMO_ZONE_MAX * imo_size, 0,
3101             IMO_ZONE_NAME);
3102         if (imo_zone == NULL) {
3103                 panic("%s: failed allocating %s", __func__, IMO_ZONE_NAME);
3104                 /* NOTREACHED */
3105         }
3106         zone_change(imo_zone, Z_EXPAND, TRUE);
3107 }
3108
3109 void
3110 imo_addref(struct ip_moptions *imo, int locked)
3111 {
3112         if (!locked) {
3113                 IMO_LOCK(imo);
3114         } else {
3115                 IMO_LOCK_ASSERT_HELD(imo);
3116         }
3117
3118         if (++imo->imo_refcnt == 0) {
3119                 panic("%s: imo %p wraparound refcnt\n", __func__, imo);
3120                 /* NOTREACHED */
3121         } else if (imo->imo_trace != NULL) {
3122                 (*imo->imo_trace)(imo, TRUE);
3123         }
3124
3125         if (!locked) {
3126                 IMO_UNLOCK(imo);
3127         }
3128 }
3129
3130 void
3131 imo_remref(struct ip_moptions *imo)
3132 {
3133         int i;
3134
3135         IMO_LOCK(imo);
3136         if (imo->imo_refcnt == 0) {
3137                 panic("%s: imo %p negative refcnt", __func__, imo);
3138                 /* NOTREACHED */
3139         } else if (imo->imo_trace != NULL) {
3140                 (*imo->imo_trace)(imo, FALSE);
3141         }
3142
3143         --imo->imo_refcnt;
3144         if (imo->imo_refcnt > 0) {
3145                 IMO_UNLOCK(imo);
3146                 return;
3147         }
3148
3149         for (i = 0; i < imo->imo_num_memberships; ++i) {
3150                 struct in_mfilter *imf;
3151
3152                 imf = imo->imo_mfilters ? &imo->imo_mfilters[i] : NULL;
3153                 if (imf != NULL) {
3154                         imf_leave(imf);
3155                 }
3156
3157                 (void) in_leavegroup(imo->imo_membership[i], imf);
3158
3159                 if (imf != NULL) {
3160                         imf_purge(imf);
3161                 }
3162
3163                 INM_REMREF(imo->imo_membership[i]);
3164                 imo->imo_membership[i] = NULL;
3165         }
3166         imo->imo_num_memberships = 0;
3167         if (imo->imo_mfilters != NULL) {
3168                 FREE(imo->imo_mfilters, M_INMFILTER);
3169                 imo->imo_mfilters = NULL;
3170         }
3171         if (imo->imo_membership != NULL) {
3172                 FREE(imo->imo_membership, M_IPMOPTS);
3173                 imo->imo_membership = NULL;
3174         }
3175         IMO_UNLOCK(imo);
3176
3177         lck_mtx_destroy(&imo->imo_lock, ifa_mtx_grp);
3178
3179         if (!(imo->imo_debug & IFD_ALLOC)) {
3180                 panic("%s: imo %p cannot be freed", __func__, imo);
3181                 /* NOTREACHED */
3182         }
3183         zfree(imo_zone, imo);
3184 }
3185
3186 static void
3187 imo_trace(struct ip_moptions *imo, int refhold)
3188 {
3189         struct ip_moptions_dbg *imo_dbg = (struct ip_moptions_dbg *)imo;
3190         ctrace_t *tr;
3191         u_int32_t idx;
3192         u_int16_t *cnt;
3193
3194         if (!(imo->imo_debug & IFD_DEBUG)) {
3195                 panic("%s: imo %p has no debug structure", __func__, imo);
3196                 /* NOTREACHED */
3197         }
3198         if (refhold) {
3199                 cnt = &imo_dbg->imo_refhold_cnt;
3200                 tr = imo_dbg->imo_refhold;
3201         } else {
3202                 cnt = &imo_dbg->imo_refrele_cnt;
3203                 tr = imo_dbg->imo_refrele;
3204         }
3205
3206         idx = atomic_add_16_ov(cnt, 1) % IMO_TRACE_HIST_SIZE;
3207         ctrace_record(&tr[idx]);
3208 }
3209
3210 struct ip_moptions *
3211 ip_allocmoptions(int how)
3212 {
3213         struct ip_moptions *imo;
3214
3215         imo = (how == M_WAITOK) ? zalloc(imo_zone) : zalloc_noblock(imo_zone);
3216         if (imo != NULL) {
3217                 bzero(imo, imo_size);
3218                 lck_mtx_init(&imo->imo_lock, ifa_mtx_grp, ifa_mtx_attr);
3219                 imo->imo_debug |= IFD_ALLOC;
3220                 if (imo_debug != 0) {
3221                         imo->imo_debug |= IFD_DEBUG;
3222                         imo->imo_trace = imo_trace;
3223                 }
3224                 IMO_ADDREF(imo);
3225         }
3226
3227         return imo;
3228 }
3229
3230 /*
3231  * Routine called from ip_output() to loop back a copy of an IP multicast
3232  * packet to the input queue of a specified interface.  Note that this
3233  * calls the output routine of the loopback "driver", but with an interface
3234  * pointer that might NOT be a loopback interface -- evil, but easier than
3235  * replicating that code here.
3236  */
3237 static void
3238 ip_mloopback(struct ifnet *srcifp, struct ifnet *origifp, struct mbuf *m,
3239     struct sockaddr_in *dst, int hlen)
3240 {
3241         struct mbuf *copym;
3242         struct ip *ip;
3243
3244         if (lo_ifp == NULL) {
3245                 return;
3246         }
3247
3248         /*
3249          * Copy the packet header as it's needed for the checksum
3250          * Make sure to deep-copy IP header portion in case the data
3251          * is in an mbuf cluster, so that we can safely override the IP
3252          * header portion later.
3253          */
3254         copym = m_copym_mode(m, 0, M_COPYALL, M_DONTWAIT, M_COPYM_COPY_HDR);
3255         if (copym != NULL && ((copym->m_flags & M_EXT) || copym->m_len < hlen)) {
3256                 copym = m_pullup(copym, hlen);
3257         }
3258
3259         if (copym == NULL) {
3260                 return;
3261         }
3262
3263         /*
3264          * We don't bother to fragment if the IP length is greater
3265          * than the interface's MTU.  Can this possibly matter?
3266          */
3267         ip = mtod(copym, struct ip *);
3268 #if BYTE_ORDER != BIG_ENDIAN
3269         HTONS(ip->ip_len);
3270         HTONS(ip->ip_off);
3271 #endif
3272         ip->ip_sum = 0;
3273         ip->ip_sum = ip_cksum_hdr_out(copym, hlen);
3274
3275         /*
3276          * Mark checksum as valid unless receive checksum offload is
3277          * disabled; if so, compute checksum in software.  If the
3278          * interface itself is lo0, this will be overridden by if_loop.
3279          */
3280         if (hwcksum_rx) {
3281                 copym->m_pkthdr.csum_flags &= ~(CSUM_PARTIAL | CSUM_ZERO_INVERT);
3282                 copym->m_pkthdr.csum_flags |=
3283                     CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
3284                 copym->m_pkthdr.csum_data = 0xffff;
3285         } else if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
3286 #if BYTE_ORDER != BIG_ENDIAN
3287                 NTOHS(ip->ip_len);
3288 #endif
3289                 in_delayed_cksum(copym);
3290 #if BYTE_ORDER != BIG_ENDIAN
3291                 HTONS(ip->ip_len);
3292 #endif
3293         }
3294
3295         /*
3296          * Stuff the 'real' ifp into the pkthdr, to be used in matching
3297          * in ip_input(); we need the loopback ifp/dl_tag passed as args
3298          * to make the loopback driver compliant with the data link
3299          * requirements.
3300          */
3301         copym->m_pkthdr.rcvif = origifp;
3302
3303         /*
3304          * Also record the source interface (which owns the source address).
3305          * This is basically a stripped down version of ifa_foraddr().
3306          */
3307         if (srcifp == NULL) {
3308                 struct in_ifaddr *ia;
3309
3310                 lck_rw_lock_shared(in_ifaddr_rwlock);
3311                 TAILQ_FOREACH(ia, INADDR_HASH(ip->ip_src.s_addr), ia_hash) {
3312                         IFA_LOCK_SPIN(&ia->ia_ifa);
3313                         if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_src.s_addr) {
3314                                 srcifp = ia->ia_ifp;
3315                                 IFA_UNLOCK(&ia->ia_ifa);
3316                                 break;
3317                         }
3318                         IFA_UNLOCK(&ia->ia_ifa);
3319                 }
3320                 lck_rw_done(in_ifaddr_rwlock);
3321         }
3322         if (srcifp != NULL) {
3323                 ip_setsrcifaddr_info(copym, srcifp->if_index, NULL);
3324         }
3325         ip_setdstifaddr_info(copym, origifp->if_index, NULL);
3326
3327         dlil_output(lo_ifp, PF_INET, copym, NULL, SA(dst), 0, NULL);
3328 }
3329
3330 /*
3331  * Given a source IP address (and route, if available), determine the best
3332  * interface to send the packet from.  Checking for (and updating) the
3333  * ROF_SRCIF_SELECTED flag in the pcb-supplied route placeholder is done
3334  * without any locks based on the assumption that ip_output() is single-
3335  * threaded per-pcb, i.e. for any given pcb there can only be one thread
3336  * performing output at the IP layer.
3337  *
3338  * This routine is analogous to in6_selectroute() for IPv6.
3339  */
3340 static struct ifaddr *
3341 in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope)
3342 {
3343         struct ifaddr *ifa = NULL;
3344         struct in_addr src = ip->ip_src;
3345         struct in_addr dst = ip->ip_dst;
3346         struct ifnet *rt_ifp;
3347         char s_src[MAX_IPv4_STR_LEN], s_dst[MAX_IPv4_STR_LEN];
3348
3349         VERIFY(src.s_addr != INADDR_ANY);
3350
3351         if (ip_select_srcif_debug) {
3352                 (void) inet_ntop(AF_INET, &src.s_addr, s_src, sizeof(s_src));
3353                 (void) inet_ntop(AF_INET, &dst.s_addr, s_dst, sizeof(s_dst));
3354         }
3355
3356         if (ro->ro_rt != NULL) {
3357                 RT_LOCK(ro->ro_rt);
3358         }
3359
3360         rt_ifp = (ro->ro_rt != NULL) ? ro->ro_rt->rt_ifp : NULL;
3361
3362         /*
3363          * Given the source IP address, find a suitable source interface
3364          * to use for transmission; if the caller has specified a scope,
3365          * optimize the search by looking at the addresses only for that
3366          * interface.  This is still suboptimal, however, as we need to
3367          * traverse the per-interface list.
3368          */
3369         if (ifscope != IFSCOPE_NONE || ro->ro_rt != NULL) {
3370                 unsigned int scope = ifscope;
3371
3372                 /*
3373                  * If no scope is specified and the route is stale (pointing
3374                  * to a defunct interface) use the current primary interface;
3375                  * this happens when switching between interfaces configured
3376                  * with the same IP address.  Otherwise pick up the scope
3377                  * information from the route; the ULP may have looked up a
3378                  * correct route and we just need to verify it here and mark
3379                  * it with the ROF_SRCIF_SELECTED flag below.
3380                  */
3381                 if (scope == IFSCOPE_NONE) {
3382                         scope = rt_ifp->if_index;
3383                         if (scope != get_primary_ifscope(AF_INET) &&
3384                             ROUTE_UNUSABLE(ro)) {
3385                                 scope = get_primary_ifscope(AF_INET);
3386                         }
3387                 }
3388
3389                 ifa = (struct ifaddr *)ifa_foraddr_scoped(src.s_addr, scope);
3390
3391                 if (ifa == NULL && ip->ip_p != IPPROTO_UDP &&
3392                     ip->ip_p != IPPROTO_TCP && ipforwarding) {
3393                         /*
3394                          * If forwarding is enabled, and if the packet isn't
3395                          * TCP or UDP, check if the source address belongs
3396                          * to one of our own interfaces; if so, demote the
3397                          * interface scope and do a route lookup right below.
3398                          */
3399                         ifa = (struct ifaddr *)ifa_foraddr(src.s_addr);
3400                         if (ifa != NULL) {
3401                                 IFA_REMREF(ifa);
3402                                 ifa = NULL;
3403                                 ifscope = IFSCOPE_NONE;
3404                         }
3405                 }
3406
3407                 if (ip_select_srcif_debug && ifa != NULL) {
3408                         if (ro->ro_rt != NULL) {
3409                                 printf("%s->%s ifscope %d->%d ifa_if %s "
3410                                     "ro_if %s\n", s_src, s_dst, ifscope,
3411                                     scope, if_name(ifa->ifa_ifp),
3412                                     if_name(rt_ifp));
3413                         } else {
3414                                 printf("%s->%s ifscope %d->%d ifa_if %s\n",
3415                                     s_src, s_dst, ifscope, scope,
3416                                     if_name(ifa->ifa_ifp));
3417                         }
3418                 }
3419         }
3420
3421         /*
3422          * Slow path; search for an interface having the corresponding source
3423          * IP address if the scope was not specified by the caller, and:
3424          *
3425          *   1) There currently isn't any route, or,
3426          *   2) The interface used by the route does not own that source
3427          *      IP address; in this case, the route will get blown away
3428          *      and we'll do a more specific scoped search using the newly
3429          *      found interface.
3430          */
3431         if (ifa == NULL && ifscope == IFSCOPE_NONE) {
3432                 ifa = (struct ifaddr *)ifa_foraddr(src.s_addr);
3433
3434                 /*
3435                  * If we have the IP address, but not the route, we don't
3436                  * really know whether or not it belongs to the correct
3437                  * interface (it could be shared across multiple interfaces.)
3438                  * The only way to find out is to do a route lookup.
3439                  */
3440                 if (ifa != NULL && ro->ro_rt == NULL) {
3441                         struct rtentry *rt;
3442                         struct sockaddr_in sin;
3443                         struct ifaddr *oifa = NULL;
3444
3445                         bzero(&sin, sizeof(sin));
3446                         sin.sin_family = AF_INET;
3447                         sin.sin_len = sizeof(sin);
3448                         sin.sin_addr = dst;
3449
3450                         lck_mtx_lock(rnh_lock);
3451                         if ((rt = rt_lookup(TRUE, SA(&sin), NULL,
3452                             rt_tables[AF_INET], IFSCOPE_NONE)) != NULL) {
3453                                 RT_LOCK(rt);
3454                                 /*
3455                                  * If the route uses a different interface,
3456                                  * use that one instead.  The IP address of
3457                                  * the ifaddr that we pick up here is not
3458                                  * relevant.
3459                                  */
3460                                 if (ifa->ifa_ifp != rt->rt_ifp) {
3461                                         oifa = ifa;
3462                                         ifa = rt->rt_ifa;
3463                                         IFA_ADDREF(ifa);
3464                                         RT_UNLOCK(rt);
3465                                 } else {
3466                                         RT_UNLOCK(rt);
3467                                 }
3468                                 rtfree_locked(rt);
3469                         }
3470                         lck_mtx_unlock(rnh_lock);
3471
3472                         if (oifa != NULL) {
3473                                 struct ifaddr *iifa;
3474
3475                                 /*
3476                                  * See if the interface pointed to by the
3477                                  * route is configured with the source IP
3478                                  * address of the packet.
3479                                  */
3480                                 iifa = (struct ifaddr *)ifa_foraddr_scoped(
3481                                         src.s_addr, ifa->ifa_ifp->if_index);
3482
3483                                 if (iifa != NULL) {
3484                                         /*
3485                                          * Found it; drop the original one
3486                                          * as well as the route interface
3487                                          * address, and use this instead.
3488                                          */
3489                                         IFA_REMREF(oifa);
3490                                         IFA_REMREF(ifa);
3491                                         ifa = iifa;
3492                                 } else if (!ipforwarding ||
3493                                     (rt->rt_flags & RTF_GATEWAY)) {
3494                                         /*
3495                                          * This interface doesn't have that
3496                                          * source IP address; drop the route
3497                                          * interface address and just use the
3498                                          * original one, and let the caller
3499                                          * do a scoped route lookup.
3500                                          */
3501                                         IFA_REMREF(ifa);
3502                                         ifa = oifa;
3503                                 } else {
3504                                         /*
3505                                          * Forwarding is enabled and the source
3506                                          * address belongs to one of our own
3507                                          * interfaces which isn't the outgoing
3508                                          * interface, and we have a route, and
3509                                          * the destination is on a network that
3510                                          * is directly attached (onlink); drop
3511                                          * the original one and use the route
3512                                          * interface address instead.
3513                                          */
3514                                         IFA_REMREF(oifa);
3515                                 }
3516                         }
3517                 } else if (ifa != NULL && ro->ro_rt != NULL &&
3518                     !(ro->ro_rt->rt_flags & RTF_GATEWAY) &&
3519                     ifa->ifa_ifp != ro->ro_rt->rt_ifp && ipforwarding) {
3520                         /*
3521                          * Forwarding is enabled and the source address belongs
3522                          * to one of our own interfaces which isn't the same
3523                          * as the interface used by the known route; drop the
3524                          * original one and use the route interface address.
3525                          */
3526                         IFA_REMREF(ifa);
3527                         ifa = ro->ro_rt->rt_ifa;
3528                         IFA_ADDREF(ifa);
3529                 }
3530
3531                 if (ip_select_srcif_debug && ifa != NULL) {
3532                         printf("%s->%s ifscope %d ifa_if %s\n",
3533                             s_src, s_dst, ifscope, if_name(ifa->ifa_ifp));
3534                 }
3535         }
3536
3537         if (ro->ro_rt != NULL) {
3538                 RT_LOCK_ASSERT_HELD(ro->ro_rt);
3539         }
3540         /*
3541          * If there is a non-loopback route with the wrong interface, or if
3542          * there is no interface configured with such an address, blow it
3543          * away.  Except for local/loopback, we look for one with a matching
3544          * interface scope/index.
3545          */
3546         if (ro->ro_rt != NULL &&
3547             (ifa == NULL || (ifa->ifa_ifp != rt_ifp && rt_ifp != lo_ifp) ||
3548             !(ro->ro_rt->rt_flags & RTF_UP))) {
3549                 if (ip_select_srcif_debug) {
3550                         if (ifa != NULL) {
3551                                 printf("%s->%s ifscope %d ro_if %s != "
3552                                     "ifa_if %s (cached route cleared)\n",
3553                                     s_src, s_dst, ifscope, if_name(rt_ifp),
3554                                     if_name(ifa->ifa_ifp));
3555                         } else {
3556                                 printf("%s->%s ifscope %d ro_if %s "
3557                                     "(no ifa_if found)\n",
3558                                     s_src, s_dst, ifscope, if_name(rt_ifp));
3559                         }
3560                 }
3561
3562                 RT_UNLOCK(ro->ro_rt);
3563                 ROUTE_RELEASE(ro);
3564
3565                 /*
3566                  * If the destination is IPv4 LLA and the route's interface
3567                  * doesn't match the source interface, then the source IP
3568                  * address is wrong; it most likely belongs to the primary
3569                  * interface associated with the IPv4 LL subnet.  Drop the
3570                  * packet rather than letting it go out and return an error
3571                  * to the ULP.  This actually applies not only to IPv4 LL
3572                  * but other shared subnets; for now we explicitly test only
3573                  * for the former case and save the latter for future.
3574                  */
3575                 if (IN_LINKLOCAL(ntohl(dst.s_addr)) &&
3576                     !IN_LINKLOCAL(ntohl(src.s_addr)) && ifa != NULL) {
3577                         IFA_REMREF(ifa);
3578                         ifa = NULL;
3579                 }
3580         }
3581
3582         if (ip_select_srcif_debug && ifa == NULL) {
3583                 printf("%s->%s ifscope %d (neither ro_if/ifa_if found)\n",
3584                     s_src, s_dst, ifscope);
3585         }
3586
3587         /*
3588          * If there is a route, mark it accordingly.  If there isn't one,
3589          * we'll get here again during the next transmit (possibly with a
3590          * route) and the flag will get set at that point.  For IPv4 LLA
3591          * destination, mark it only if the route has been fully resolved;
3592          * otherwise we want to come back here again when the route points
3593          * to the interface over which the ARP reply arrives on.
3594          */
3595         if (ro->ro_rt != NULL && (!IN_LINKLOCAL(ntohl(dst.s_addr)) ||
3596             (ro->ro_rt->rt_gateway->sa_family == AF_LINK &&
3597             SDL(ro->ro_rt->rt_gateway)->sdl_alen != 0))) {
3598                 if (ifa != NULL) {
3599                         IFA_ADDREF(ifa);        /* for route */
3600                 }
3601                 if (ro->ro_srcia != NULL) {
3602                         IFA_REMREF(ro->ro_srcia);
3603                 }
3604                 ro->ro_srcia = ifa;
3605                 ro->ro_flags |= ROF_SRCIF_SELECTED;
3606                 RT_GENID_SYNC(ro->ro_rt);
3607         }
3608
3609         if (ro->ro_rt != NULL) {
3610                 RT_UNLOCK(ro->ro_rt);
3611         }
3612
3613         return ifa;
3614 }
3615
3616 /*
3617  * @brief       Given outgoing interface it determines what checksum needs
3618  *      to be computed in software and what needs to be offloaded to the
3619  *      interface.
3620  *
3621  * @param       ifp Pointer to the outgoing interface
3622  * @param       m Pointer to the packet
3623  * @param       hlen IP header length
3624  * @param       ip_len Total packet size i.e. headers + data payload
3625  * @param       sw_csum Pointer to a software checksum flag set
3626  *
3627  * @return      void
3628  */
3629 void
3630 ip_output_checksum(struct ifnet *ifp, struct mbuf *m, int hlen, int ip_len,
3631     uint32_t *sw_csum)
3632 {
3633         int tso = TSO_IPV4_OK(ifp, m);
3634         uint32_t hwcap = ifp->if_hwassist;
3635
3636         m->m_pkthdr.csum_flags |= CSUM_IP;
3637
3638         if (!hwcksum_tx) {
3639                 /* do all in software; hardware checksum offload is disabled */
3640                 *sw_csum = (CSUM_DELAY_DATA | CSUM_DELAY_IP) &
3641                     m->m_pkthdr.csum_flags;
3642         } else {
3643                 /* do in software what the hardware cannot */
3644                 *sw_csum = m->m_pkthdr.csum_flags &
3645                     ~IF_HWASSIST_CSUM_FLAGS(hwcap);
3646         }
3647
3648         if (hlen != sizeof(struct ip)) {
3649                 *sw_csum |= ((CSUM_DELAY_DATA | CSUM_DELAY_IP) &
3650                     m->m_pkthdr.csum_flags);
3651         } else if (!(*sw_csum & CSUM_DELAY_DATA) && (hwcap & CSUM_PARTIAL)) {
3652                 int interface_mtu = ifp->if_mtu;
3653
3654                 if (INTF_ADJUST_MTU_FOR_CLAT46(ifp)) {
3655                         interface_mtu = IN6_LINKMTU(ifp);
3656                         /* Further adjust the size for CLAT46 expansion */
3657                         interface_mtu -= CLAT46_HDR_EXPANSION_OVERHD;
3658                 }
3659
3660                 /*
3661                  * Partial checksum offload, if non-IP fragment, and TCP only
3662                  * (no UDP support, as the hardware may not be able to convert
3663                  * +0 to -0 (0xffff) per RFC1122 4.1.3.4. unless the interface
3664                  * supports "invert zero" capability.)
3665                  */
3666                 if (hwcksum_tx && !tso &&
3667                     ((m->m_pkthdr.csum_flags & CSUM_TCP) ||
3668                     ((hwcap & CSUM_ZERO_INVERT) &&
3669                     (m->m_pkthdr.csum_flags & CSUM_ZERO_INVERT))) &&
3670                     ip_len <= interface_mtu) {
3671                         uint16_t start = sizeof(struct ip);
3672                         uint16_t ulpoff = m->m_pkthdr.csum_data & 0xffff;
3673                         m->m_pkthdr.csum_flags |=
3674                             (CSUM_DATA_VALID | CSUM_PARTIAL);
3675                         m->m_pkthdr.csum_tx_stuff = (ulpoff + start);
3676                         m->m_pkthdr.csum_tx_start = start;
3677                         /* do IP hdr chksum in software */
3678                         *sw_csum = CSUM_DELAY_IP;
3679                 } else {
3680                         *sw_csum |= (CSUM_DELAY_DATA & m->m_pkthdr.csum_flags);
3681                 }
3682         }
3683
3684         if (*sw_csum & CSUM_DELAY_DATA) {
3685                 in_delayed_cksum(m);
3686                 *sw_csum &= ~CSUM_DELAY_DATA;
3687         }
3688
3689         if (hwcksum_tx) {
3690                 /*
3691                  * Drop off bits that aren't supported by hardware;
3692                  * also make sure to preserve non-checksum related bits.
3693                  */
3694                 m->m_pkthdr.csum_flags =
3695                     ((m->m_pkthdr.csum_flags &
3696                     (IF_HWASSIST_CSUM_FLAGS(hwcap) | CSUM_DATA_VALID)) |
3697                     (m->m_pkthdr.csum_flags & ~IF_HWASSIST_CSUM_MASK));
3698         } else {
3699                 /* drop all bits; hardware checksum offload is disabled */
3700                 m->m_pkthdr.csum_flags = 0;
3701         }
3702 }
3703
3704 /*
3705  * GRE protocol output for PPP/PPTP
3706  */
3707 int
3708 ip_gre_output(struct mbuf *m)
3709 {
3710         struct route ro;
3711         int error;
3712
3713         bzero(&ro, sizeof(ro));
3714
3715         error = ip_output(m, NULL, &ro, 0, NULL, NULL);
3716
3717         ROUTE_RELEASE(&ro);
3718
3719         return error;
3720 }
3721
3722 static int
3723 sysctl_reset_ip_output_stats SYSCTL_HANDLER_ARGS
3724 {
3725 #pragma unused(arg1, arg2)
3726         int error, i;
3727
3728         i = ip_output_measure;
3729         error = sysctl_handle_int(oidp, &i, 0, req);
3730         if (error || req->newptr == USER_ADDR_NULL) {
3731                 goto done;
3732         }
3733         /* impose bounds */
3734         if (i < 0 || i > 1) {
3735                 error = EINVAL;
3736                 goto done;
3737         }
3738         if (ip_output_measure != i && i == 1) {
3739                 net_perf_initialize(&net_perf, ip_output_measure_bins);
3740         }
3741         ip_output_measure = i;
3742 done:
3743         return error;
3744 }
3745
3746 static int
3747 sysctl_ip_output_measure_bins SYSCTL_HANDLER_ARGS
3748 {
3749 #pragma unused(arg1, arg2)
3750         int error;
3751         uint64_t i;
3752
3753         i = ip_output_measure_bins;
3754         error = sysctl_handle_quad(oidp, &i, 0, req);
3755         if (error || req->newptr == USER_ADDR_NULL) {
3756                 goto done;
3757         }
3758         /* validate data */
3759         if (!net_perf_validate_bins(i)) {
3760                 error = EINVAL;
3761                 goto done;
3762         }
3763         ip_output_measure_bins = i;
3764 done:
3765         return error;
3766 }
3767
3768 static int
3769 sysctl_ip_output_getperf SYSCTL_HANDLER_ARGS
3770 {
3771 #pragma unused(oidp, arg1, arg2)
3772         if (req->oldptr == USER_ADDR_NULL) {
3773                 req->oldlen = (size_t)sizeof(struct ipstat);
3774         }
3775
3776         return SYSCTL_OUT(req, &net_perf, MIN(sizeof(net_perf), req->oldlen));
3777 }