bsd/netinet/ip_output.c

   1 /*
   2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  30  *      The Regents of the University of California.  All rights reserved.
  31  *
  32  * Redistribution and use in source and binary forms, with or without
  33  * modification, are permitted provided that the following conditions
  34  * are met:
  35  * 1. Redistributions of source code must retain the above copyright
  36  *    notice, this list of conditions and the following disclaimer.
  37  * 2. Redistributions in binary form must reproduce the above copyright
  38  *    notice, this list of conditions and the following disclaimer in the
  39  *    documentation and/or other materials provided with the distribution.
  40  * 3. All advertising materials mentioning features or use of this software
  41  *    must display the following acknowledgement:
  42  *      This product includes software developed by the University of
  43  *      California, Berkeley and its contributors.
  44  * 4. Neither the name of the University nor the names of its contributors
  45  *    may be used to endorse or promote products derived from this software
  46  *    without specific prior written permission.
  47  *
  48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  58  * SUCH DAMAGE.
  59  *
  60  *      @(#)ip_output.c 8.3 (Berkeley) 1/21/94
  61  */
  62 /*
  63  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  64  * support for mandatory and extensible security protections.  This notice
  65  * is included in support of clause 2.2 (b) of the Apple Public License,
  66  * Version 2.0.
  67  */
  68
  69 #define _IP_VHL
  70
  71 #include <sys/param.h>
  72 #include <sys/systm.h>
  73 #include <sys/kernel.h>
  74 #include <sys/malloc.h>
  75 #include <sys/mbuf.h>
  76 #include <sys/protosw.h>
  77 #include <sys/socket.h>
  78 #include <sys/socketvar.h>
  79 #include <kern/locks.h>
  80 #include <sys/sysctl.h>
  81 #include <sys/mcache.h>
  82 #include <sys/kdebug.h>
  83
  84 #include <machine/endian.h>
  85 #include <pexpert/pexpert.h>
  86 #include <mach/sdt.h>
  87
  88 #include <libkern/OSAtomic.h>
  89 #include <libkern/OSByteOrder.h>
  90
  91 #include <net/if.h>
  92 #include <net/if_dl.h>
  93 #include <net/if_types.h>
  94 #include <net/route.h>
  95 #include <net/ntstat.h>
  96 #include <net/net_osdep.h>
  97 #include <net/dlil.h>
  98 #include <net/net_perf.h>
  99
 100 #include <netinet/in.h>
 101 #include <netinet/in_systm.h>
 102 #include <netinet/ip.h>
 103 #include <netinet/in_pcb.h>
 104 #include <netinet/in_var.h>
 105 #include <netinet/ip_var.h>
 106 #include <netinet/kpi_ipfilter_var.h>
 107 #include <netinet/in_tclass.h>
 108 #include <netinet/udp.h>
 109
 110 #include <netinet6/nd6.h>
 111
 112 #define DBG_LAYER_BEG           NETDBG_CODE(DBG_NETIP, 1)       /* traced by ip_output_list() with dst, src, proto, off, len after the header fill-in step */
 113 #define DBG_LAYER_END           NETDBG_CODE(DBG_NETIP, 3)
 114 #define DBG_FNC_IP_OUTPUT       NETDBG_CODE(DBG_NETIP, (1 << 8) | 1)    /* traced with DBG_FUNC_START on entry to ip_output_list() */
 115 #define DBG_FNC_IPSEC4_OUTPUT   NETDBG_CODE(DBG_NETIP, (2 << 8) | 1)
 116
 117 #if IPSEC
 118 #include <netinet6/ipsec.h>
 119 #include <netkey/key.h>
 120 #if IPSEC_DEBUG
 121 #include <netkey/key_debug.h>
 122 #else
 123 #define KEYDEBUG(lev, arg)
 124 #endif
 125 #endif /* IPSEC */
 126
 127 #if NECP
 128 #include <net/necp.h>
 129 #endif /* NECP */
 130
 131
 132 #if DUMMYNET
 133 #include <netinet/ip_dummynet.h>
 134 #endif
 135
 136 #if PF
 137 #include <net/pfvar.h>
 138 #endif /* PF */
 139
 140
 141 u_short ip_id;  /* NOTE(review): the header fill-in path of ip_output_list() takes IDs from ip_randomid(), not this variable -- confirm remaining users */
 142
 143 static int sysctl_reset_ip_output_stats SYSCTL_HANDLER_ARGS;    /* handler registered for the output_perf sysctl */
 144 static int sysctl_ip_output_measure_bins SYSCTL_HANDLER_ARGS;   /* handler registered for the output_perf_bins sysctl */
 145 static int sysctl_ip_output_getperf SYSCTL_HANDLER_ARGS;        /* handler registered for the output_perf_data sysctl */
 146 static void ip_out_cksum_stats(int, u_int32_t);
 147 static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);      /* called as (packet, opt, &len); ip_output_list() then uses len as the header length */
 148 static int ip_optcopy(struct ip *, struct ip *);
 149 static int ip_pcbopts(int, struct mbuf **, struct mbuf *);
 150 static void imo_trace(struct ip_moptions *, int);
 151 static void ip_mloopback(struct ifnet *, struct ifnet *, struct mbuf *,
 152     struct sockaddr_in *, int);
 153 static struct ifaddr *in_selectsrcif(struct ip *, struct route *, unsigned int);        /* source interface lookup; called as (ip, ro, ifscope) */
 154
 155 extern struct ip_linklocal_stat ip_linklocal_stat;
 156
 157 /* temporary: for testing */
 158 #if IPSEC
 159 extern int ipsec_bypass;        /* ip_output_list() runs its IPsec policy/socket handling only while this is 0 */
 160 #endif
 161
 162 static int ip_maxchainsent = 0; /* exported read-write as the maxchainsent sysctl below */
 163 SYSCTL_INT(_net_inet_ip, OID_AUTO, maxchainsent,
 164     CTLFLAG_RW | CTLFLAG_LOCKED, &ip_maxchainsent, 0,
 165     "use dlil_output_list");
 166 #if DEBUG
 167 static int forge_ce = 0;        /* while nonzero, ip_output_list() re-marks each ECT0/ECT1 packet as CE and decrements this */
 168 SYSCTL_INT(_net_inet_ip, OID_AUTO, forge_ce,
 169     CTLFLAG_RW | CTLFLAG_LOCKED, &forge_ce, 0,
 170     "Forge ECN CE");
 171 #endif /* DEBUG */
 172
 173 static int ip_select_srcif_debug = 0;
 174 SYSCTL_INT(_net_inet_ip, OID_AUTO, select_srcif_debug,
 175     CTLFLAG_RW | CTLFLAG_LOCKED, &ip_select_srcif_debug, 0,
 176     "log source interface selection debug info");
 177
 178 static int ip_output_measure = 0;       /* when nonzero, ip_output_list() calls net_perf_start_time() on entry */
 179 SYSCTL_PROC(_net_inet_ip, OID_AUTO, output_perf,
 180     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
 181     &ip_output_measure, 0, sysctl_reset_ip_output_stats, "I",
 182     "Do time measurement");
 183
 184 static uint64_t ip_output_measure_bins = 0;
 185 SYSCTL_PROC(_net_inet_ip, OID_AUTO, output_perf_bins,
 186     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &ip_output_measure_bins, 0,
 187     sysctl_ip_output_measure_bins, "I", /* NOTE(review): CTLTYPE_QUAD on a uint64_t is paired with format "I" -- confirm whether "Q" is intended */
 188     "bins for chaining performance data histogram");
 189
 190 static net_perf_t net_perf;     /* measurement state handed to net_perf_start_time() by ip_output_list() */
 191 SYSCTL_PROC(_net_inet_ip, OID_AUTO, output_perf_data,
 192     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
 193     0, 0, sysctl_ip_output_getperf, "S,net_perf",
 194     "IP output performance data (struct net_perf, net/net_perf.h)");
 195
 196 __private_extern__ int rfc6864 = 1;     /* when nonzero, locally built packets whose ip_off passes IP_OFF_IS_ATOMIC get ip_id 0 instead of ip_randomid() */
 197 SYSCTL_INT(_net_inet_ip, OID_AUTO, rfc6864, CTLFLAG_RW | CTLFLAG_LOCKED,
 198     &rfc6864, 0, "updated ip id field behavior");
 199
 200 #define IMO_TRACE_HIST_SIZE     32      /* entries in each imo_refhold[]/imo_refrele[] trace array */
 201
 202 /* For gdb: exposes IMO_TRACE_HIST_SIZE as a variable */
 203 __private_extern__ unsigned int imo_trace_hist_size = IMO_TRACE_HIST_SIZE;
 204
 205 struct ip_moptions_dbg {        /* ip_moptions plus reference-tracing bookkeeping */
 206         struct ip_moptions      imo;                    /* embedded ip_moptions (first member) */
 207         u_int16_t               imo_refhold_cnt;        /* # of IMO_ADDREF */
 208         u_int16_t               imo_refrele_cnt;        /* # of IMO_REMREF */
 209         /*
 210          * Alloc and free callers.
 211          */
 212         ctrace_t                imo_alloc;
 213         ctrace_t                imo_free;
 214         /*
 215          * Circular lists of IMO_ADDREF and IMO_REMREF callers.
 216          */
 217         ctrace_t                imo_refhold[IMO_TRACE_HIST_SIZE];
 218         ctrace_t                imo_refrele[IMO_TRACE_HIST_SIZE];
 219 };
 220
 221 #if DEBUG
 222 static unsigned int imo_debug = 1;      /* imo debugging: on by default in DEBUG builds */
 223 #else
 224 static unsigned int imo_debug;          /* imo debugging: zero-initialized (off) in other builds */
 225 #endif /* !DEBUG */
 226 static struct zone *imo_zone;           /* zone for ip_moptions */
 227 #define IMO_ZONE_NAME           "ip_moptions"   /* zone name */
 228
 229 /*
 230  * IP output.  Thin wrapper that calls ip_output_list() with packetchain
 231  * set to 0.  The packet in mbuf chain m0 contains a skeletal IP header
 232  * (with len, off, ttl, proto, tos, src, dst).  The mbuf chain containing
 233  * the packet will be freed.  The mbuf opt, if present, will not be freed.
 234  */
 235 int
 236 ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro, int flags,
 237     struct ip_moptions *imo, struct ip_out_args *ipoa)
 238 {
 239         return ip_output_list(m0, 0, opt, ro, flags, imo, ipoa);        /* second argument is packetchain */
 240 }
 241
 242 /*
 243  * IP output.  The packet in mbuf chain m contains a skeletal IP
 244  * header (with len, off, ttl, proto, tos, src, dst).
 245  * The mbuf chain containing the packet will be freed.
 246  * The mbuf opt, if present, will not be freed.
 247  *
 248  * Route ro MUST be non-NULL; if ro->ro_rt is valid, route lookup would be
 249  * skipped and ro->ro_rt would be used.  Otherwise the result of route
 250  * lookup is stored in ro->ro_rt.
 251  *
 252  * In the IP forwarding case, the packet will arrive with options already
 253  * inserted, so must have a NULL opt pointer.
 254  */
 255 int
 256 ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt,
 257     struct route *ro, int flags, struct ip_moptions *imo,
 258     struct ip_out_args *ipoa)
 259 {
 260         struct ip *ip;
 261         struct ifnet *ifp = NULL;               /* not refcnt'd */
 262         struct mbuf *m = m0, *prevnxt = NULL, **mppn = &prevnxt;
 263         int hlen = sizeof(struct ip);
 264         int len = 0, error = 0;
 265         struct sockaddr_in *dst = NULL;
 266         struct in_ifaddr *ia = NULL, *src_ia = NULL;
 267         struct in_addr pkt_dst;
 268         struct ipf_pktopts *ippo = NULL;
 269         ipfilter_t inject_filter_ref = NULL;
 270         struct mbuf *packetlist;
 271         uint32_t sw_csum, pktcnt = 0, scnt = 0, bytecnt = 0;
 272         uint32_t packets_processed = 0;
 273         unsigned int ifscope = IFSCOPE_NONE;
 274         struct flowadv *adv = NULL;
 275         struct timeval start_tv;
 276 #if IPSEC
 277         struct socket *so = NULL;
 278         struct secpolicy *sp = NULL;
 279 #endif /* IPSEC */
 280 #if NECP
 281         necp_kernel_policy_result necp_result = 0;
 282         necp_kernel_policy_result_parameter necp_result_parameter;
 283         necp_kernel_policy_id necp_matched_policy_id = 0;
 284 #endif /* NECP */
 285 #if DUMMYNET
 286         struct m_tag *tag;
 287         struct ip_out_args saved_ipoa;
 288         struct sockaddr_in dst_buf;
 289 #endif /* DUMMYNET */
 290         struct {
 291 #if IPSEC
 292                 struct ipsec_output_state ipsec_state;
 293 #endif /* IPSEC */
 294 #if NECP
 295                 struct route necp_route;
 296 #endif /* NECP */
 297 #if DUMMYNET
 298                 struct ip_fw_args args;
 299                 struct route saved_route;
 300 #endif /* DUMMYNET */
 301                 struct ipf_pktopts ipf_pktopts;
 302         } ipobz;
 303 #define ipsec_state     ipobz.ipsec_state
 304 #define necp_route      ipobz.necp_route
 305 #define args            ipobz.args
 306 #define sro_fwd         ipobz.sro_fwd
 307 #define saved_route     ipobz.saved_route
 308 #define ipf_pktopts     ipobz.ipf_pktopts
 309         union {
 310                 struct {
 311                         boolean_t select_srcif : 1;     /* set once */
 312                         boolean_t srcbound : 1;         /* set once */
 313                         boolean_t nocell : 1;           /* set once */
 314                         boolean_t isbroadcast : 1;
 315                         boolean_t didfilter : 1;
 316                         boolean_t noexpensive : 1;      /* set once */
 317                         boolean_t noconstrained : 1;      /* set once */
 318                         boolean_t awdl_unrestricted : 1;        /* set once */
 319                 };
 320                 uint32_t raw;
 321         } ipobf = { .raw = 0 };
 322
 323         int interface_mtu = 0;
 324
 325 /*
 326  * Here we check for restrictions when sending frames.
 327  * N.B.: IPv4 over internal co-processor interfaces is not allowed.
 328  */
 329 #define IP_CHECK_RESTRICTIONS(_ifp, _ipobf)                             \
 330         (((_ipobf).nocell && IFNET_IS_CELLULAR(_ifp)) ||                \
 331          ((_ipobf).noexpensive && IFNET_IS_EXPENSIVE(_ifp)) ||          \
 332          ((_ipobf).noconstrained && IFNET_IS_CONSTRAINED(_ifp)) ||      \
 333           (IFNET_IS_INTCOPROC(_ifp)) ||                                 \
 334          (!(_ipobf).awdl_unrestricted && IFNET_IS_AWDL_RESTRICTED(_ifp)))
 335
 336         if (ip_output_measure) {
 337                 net_perf_start_time(&net_perf, &start_tv);
 338         }
 339         KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
 340
 341         VERIFY(m0->m_flags & M_PKTHDR);
 342         packetlist = m0;
 343
 344         /* zero out ipobz: {ipsec_state, necp_route, args, saved_route, ipf_pktopts} */
 345         bzero(&ipobz, sizeof(ipobz));
 346         ippo = &ipf_pktopts;
 347
 348 #if DUMMYNET
 349         if (SLIST_EMPTY(&m0->m_pkthdr.tags)) {
 350                 goto ipfw_tags_done;
 351         }
 352
 353         /* Grab info from mtags prepended to the chain */
 354         if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID,
 355             KERNEL_TAG_TYPE_DUMMYNET, NULL)) != NULL) {
 356                 struct dn_pkt_tag       *dn_tag;
 357
 358                 dn_tag = (struct dn_pkt_tag *)(tag + 1);
 359                 args.fwa_pf_rule = dn_tag->dn_pf_rule;
 360                 opt = NULL;
 361                 saved_route = dn_tag->dn_ro;
 362                 ro = &saved_route;
 363
 364                 imo = NULL;
 365                 bcopy(&dn_tag->dn_dst, &dst_buf, sizeof(dst_buf));
 366                 dst = &dst_buf;
 367                 ifp = dn_tag->dn_ifp;
 368                 flags = dn_tag->dn_flags;
 369                 if ((dn_tag->dn_flags & IP_OUTARGS)) {
 370                         saved_ipoa = dn_tag->dn_ipoa;
 371                         ipoa = &saved_ipoa;
 372                 }
 373
 374                 m_tag_delete(m0, tag);
 375         }
 376 ipfw_tags_done:
 377 #endif /* DUMMYNET */
 378
 379         m = m0;
 380         m->m_pkthdr.pkt_flags &= ~(PKTF_LOOP | PKTF_IFAINFO);
 381
 382 #if IPSEC
 383         if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) {
 384                 /* If packet is bound to an interface, check bound policies */
 385                 if ((flags & IP_OUTARGS) && (ipoa != NULL) &&
 386                     (ipoa->ipoa_flags & IPOAF_BOUND_IF) &&
 387                     ipoa->ipoa_boundif != IFSCOPE_NONE) {
 388                         if (ipsec4_getpolicybyinterface(m, IPSEC_DIR_OUTBOUND,
 389                             &flags, ipoa, &sp) != 0) {
 390                                 goto bad;
 391                         }
 392                 }
 393         }
 394 #endif /* IPSEC */
 395
 396         VERIFY(ro != NULL);
 397
 398         if (flags & IP_OUTARGS) {
 399                 /*
 400                  * In the forwarding case, only the ifscope value is used,
 401                  * as source interface selection doesn't take place.
 402                  */
 403                 if ((ipobf.select_srcif = (!(flags & IP_FORWARDING) &&
 404                     (ipoa->ipoa_flags & IPOAF_SELECT_SRCIF)))) {
 405                         ipf_pktopts.ippo_flags |= IPPOF_SELECT_SRCIF;
 406                 }
 407
 408                 if ((ipoa->ipoa_flags & IPOAF_BOUND_IF) &&
 409                     ipoa->ipoa_boundif != IFSCOPE_NONE) {
 410                         ifscope = ipoa->ipoa_boundif;
 411                         ipf_pktopts.ippo_flags |=
 412                             (IPPOF_BOUND_IF | (ifscope << IPPOF_SHIFT_IFSCOPE));
 413                 }
 414
 415                 /* double negation needed for bool bit field */
 416                 ipobf.srcbound = !!(ipoa->ipoa_flags & IPOAF_BOUND_SRCADDR);
 417                 if (ipobf.srcbound) {
 418                         ipf_pktopts.ippo_flags |= IPPOF_BOUND_SRCADDR;
 419                 }
 420         } else {
 421                 ipobf.select_srcif = FALSE;
 422                 ipobf.srcbound = FALSE;
 423                 ifscope = IFSCOPE_NONE;
 424                 if (flags & IP_OUTARGS) {
 425                         ipoa->ipoa_boundif = IFSCOPE_NONE;
 426                         ipoa->ipoa_flags &= ~(IPOAF_SELECT_SRCIF |
 427                             IPOAF_BOUND_IF | IPOAF_BOUND_SRCADDR);
 428                 }
 429         }
 430
 431         if (flags & IP_OUTARGS) {
 432                 if (ipoa->ipoa_flags & IPOAF_NO_CELLULAR) {
 433                         ipobf.nocell = TRUE;
 434                         ipf_pktopts.ippo_flags |= IPPOF_NO_IFT_CELLULAR;
 435                 }
 436                 if (ipoa->ipoa_flags & IPOAF_NO_EXPENSIVE) {
 437                         ipobf.noexpensive = TRUE;
 438                         ipf_pktopts.ippo_flags |= IPPOF_NO_IFF_EXPENSIVE;
 439                 }
 440                 if (ipoa->ipoa_flags & IPOAF_NO_CONSTRAINED) {
 441                         ipobf.noconstrained = TRUE;
 442                         ipf_pktopts.ippo_flags |= IPPOF_NO_IFF_CONSTRAINED;
 443                 }
 444                 if (ipoa->ipoa_flags & IPOAF_AWDL_UNRESTRICTED) {
 445                         ipobf.awdl_unrestricted = TRUE;
 446                 }
 447                 adv = &ipoa->ipoa_flowadv;
 448                 adv->code = FADV_SUCCESS;
 449                 ipoa->ipoa_retflags = 0;
 450         }
 451
 452 #if IPSEC
 453         if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) {
 454                 so = ipsec_getsocket(m);
 455                 if (so != NULL) {
 456                         (void) ipsec_setsocket(m, NULL);
 457                 }
 458         }
 459 #endif /* IPSEC */
 460
 461 #if DUMMYNET
 462         if (args.fwa_pf_rule != NULL) {
 463                 /* dummynet already saw us */
 464                 ip = mtod(m, struct ip *);
 465                 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
 466                 pkt_dst = ip->ip_dst;
 467                 if (ro->ro_rt != NULL) {
 468                         RT_LOCK_SPIN(ro->ro_rt);
 469                         ia = (struct in_ifaddr *)ro->ro_rt->rt_ifa;
 470                         if (ia) {
 471                                 /* Become a regular mutex */
 472                                 RT_CONVERT_LOCK(ro->ro_rt);
 473                                 IFA_ADDREF(&ia->ia_ifa);
 474                         }
 475                         RT_UNLOCK(ro->ro_rt);
 476                 }
 477
 478                 if (args.fwa_pf_rule != NULL) {
 479                         goto sendit;
 480                 }
 481         }
 482 #endif /* DUMMYNET */
 483
 484 loopit:
 485         packets_processed++;
 486         ipobf.isbroadcast = FALSE;
 487         ipobf.didfilter = FALSE;
 488
 489         VERIFY(m->m_flags & M_PKTHDR);
 490         /*
 491          * No need to process packet twice if we've already seen it.
 492          */
 493         if (!SLIST_EMPTY(&m->m_pkthdr.tags)) {
 494                 inject_filter_ref = ipf_get_inject_filter(m);
 495         } else {
 496                 inject_filter_ref = NULL;
 497         }
 498
 499         if (opt) {
 500                 m = ip_insertoptions(m, opt, &len);
 501                 hlen = len;
 502                 /* Update the chain */
 503                 if (m != m0) {
 504                         if (m0 == packetlist) {
 505                                 packetlist = m;
 506                         }
 507                         m0 = m;
 508                 }
 509         }
 510         ip = mtod(m, struct ip *);
 511
 512         pkt_dst = ip->ip_dst;
 513
 514         /*
 515          * We must not send if the packet is destined to network zero.
 516          * RFC1122 3.2.1.3 (a) and (b).
 517          */
 518         if (IN_ZERONET(ntohl(pkt_dst.s_addr))) {
 519                 error = EHOSTUNREACH;
 520                 goto bad;
 521         }
 522
 523         /*
 524          * Fill in IP header.
 525          */
 526         if (!(flags & (IP_FORWARDING | IP_RAWOUTPUT))) {
 527                 ip->ip_vhl = IP_MAKE_VHL(IPVERSION, hlen >> 2);
 528                 ip->ip_off &= IP_DF;
 529                 if (rfc6864 && IP_OFF_IS_ATOMIC(ip->ip_off)) {
 530                         // Per RFC6864, value of ip_id is undefined for atomic ip packets
 531                         ip->ip_id = 0;
 532                 } else {
 533                         ip->ip_id = ip_randomid();
 534                 }
 535                 OSAddAtomic(1, &ipstat.ips_localout);
 536         } else {
 537                 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
 538         }
 539
 540 #if DEBUG
 541         /* For debugging, we let the stack forge congestion */
 542         if (forge_ce != 0 &&
 543             ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT1 ||
 544             (ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT0)) {
 545                 ip->ip_tos = (ip->ip_tos & ~IPTOS_ECN_MASK) | IPTOS_ECN_CE;
 546                 forge_ce--;
 547         }
 548 #endif /* DEBUG */
 549
 550         KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr, ip->ip_src.s_addr,
 551             ip->ip_p, ip->ip_off, ip->ip_len);
 552
 553         dst = SIN(&ro->ro_dst);
 554
 555         /*
 556          * If there is a cached route,
 557          * check that it is to the same destination
 558          * and is still up.  If not, free it and try again.
 559          * The address family should also be checked in case of sharing the
 560          * cache with IPv6.
 561          */
 562
 563         if (ro->ro_rt != NULL) {
 564                 if (ROUTE_UNUSABLE(ro) && ip->ip_src.s_addr != INADDR_ANY &&
 565                     !(flags & (IP_ROUTETOIF | IP_FORWARDING))) {
 566                         src_ia = ifa_foraddr(ip->ip_src.s_addr);
 567                         if (src_ia == NULL) {
 568                                 error = EADDRNOTAVAIL;
 569                                 goto bad;
 570                         }
 571                         IFA_REMREF(&src_ia->ia_ifa);
 572                         src_ia = NULL;
 573                 }
 574                 /*
 575                  * Test rt_flags without holding rt_lock for performance
 576                  * reasons; if the route is down it will hopefully be
 577                  * caught by the layer below (since it uses this route
 578                  * as a hint) or during the next transmit.
 579                  */
 580                 if (ROUTE_UNUSABLE(ro) || dst->sin_family != AF_INET ||
 581                     dst->sin_addr.s_addr != pkt_dst.s_addr) {
 582                         ROUTE_RELEASE(ro);
 583                 }
 584
 585                 /*
 586                  * If we're doing source interface selection, we may not
 587                  * want to use this route; only synch up the generation
 588                  * count otherwise.
 589                  */
 590                 if (!ipobf.select_srcif && ro->ro_rt != NULL &&
 591                     RT_GENID_OUTOFSYNC(ro->ro_rt)) {
 592                         RT_GENID_SYNC(ro->ro_rt);
 593                 }
 594         }
 595         if (ro->ro_rt == NULL) {
 596                 bzero(dst, sizeof(*dst));
 597                 dst->sin_family = AF_INET;
 598                 dst->sin_len = sizeof(*dst);
 599                 dst->sin_addr = pkt_dst;
 600         }
 601         /*
 602          * If routing to interface only,
 603          * short circuit routing lookup.
 604          */
 605         if (flags & IP_ROUTETOIF) {
 606                 if (ia != NULL) {
 607                         IFA_REMREF(&ia->ia_ifa);
 608                 }
 609                 if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) {
 610                         ia = ifatoia(ifa_ifwithnet(sintosa(dst)));
 611                         if (ia == NULL) {
 612                                 OSAddAtomic(1, &ipstat.ips_noroute);
 613                                 error = ENETUNREACH;
 614                                 /* XXX IPv6 APN fallback notification?? */
 615                                 goto bad;
 616                         }
 617                 }
 618                 ifp = ia->ia_ifp;
 619                 ip->ip_ttl = 1;
 620                 ipobf.isbroadcast = in_broadcast(dst->sin_addr, ifp);
 621                 /*
 622                  * For consistency with other cases below.  Loopback
 623                  * multicast case is handled separately by ip_mloopback().
 624                  */
 625                 if ((ifp->if_flags & IFF_LOOPBACK) &&
 626                     !IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
 627                         m->m_pkthdr.rcvif = ifp;
 628                         ip_setsrcifaddr_info(m, ifp->if_index, NULL);
 629                         ip_setdstifaddr_info(m, ifp->if_index, NULL);
 630                 }
 631         } else if (IN_MULTICAST(ntohl(pkt_dst.s_addr)) &&
 632             imo != NULL && (ifp = imo->imo_multicast_ifp) != NULL) {
 633                 /*
 634                  * Bypass the normal routing lookup for multicast
 635                  * packets if the interface is specified.
 636                  */
 637                 ipobf.isbroadcast = FALSE;
 638                 if (ia != NULL) {
 639                         IFA_REMREF(&ia->ia_ifa);
 640                 }
 641
 642                 /* Macro takes reference on ia */
 643                 IFP_TO_IA(ifp, ia);
 644         } else {
 645                 struct ifaddr *ia0 = NULL;
 646                 boolean_t cloneok = FALSE;
 647                 /*
 648                  * Perform source interface selection; the source IP address
 649                  * must belong to one of the addresses of the interface used
 650                  * by the route.  For performance reasons, do this only if
 651                  * there is no route, or if the routing table has changed,
 652                  * or if we haven't done source interface selection on this
 653                  * route (for this PCB instance) before.
 654                  */
 655                 if (ipobf.select_srcif &&
 656                     ip->ip_src.s_addr != INADDR_ANY && (ROUTE_UNUSABLE(ro) ||
 657                     !(ro->ro_flags & ROF_SRCIF_SELECTED))) {
 658                         /* Find the source interface */
 659                         ia0 = in_selectsrcif(ip, ro, ifscope);
 660
 661                         /*
 662                          * If the source address belongs to a restricted
 663                          * interface and the caller forbids our using
 664                          * interfaces of such type, pretend that there is no
 665                          * route.
 666                          */
 667                         if (ia0 != NULL &&
 668                             IP_CHECK_RESTRICTIONS(ia0->ifa_ifp, ipobf)) {
 669                                 IFA_REMREF(ia0);
 670                                 ia0 = NULL;
 671                                 error = EHOSTUNREACH;
 672                                 if (flags & IP_OUTARGS) {
 673                                         ipoa->ipoa_retflags |= IPOARF_IFDENIED;
 674                                 }
 675                                 goto bad;
 676                         }
 677
 678                         /*
 679                          * If the source address is spoofed (in the case of
 680                          * IP_RAWOUTPUT on an unbounded socket), or if this
 681                          * is destined for local/loopback, just let it go out
 682                          * using the interface of the route.  Otherwise,
 683                          * there's no interface having such an address,
 684                          * so bail out.
 685                          */
 686                         if (ia0 == NULL && (!(flags & IP_RAWOUTPUT) ||
 687                             ipobf.srcbound) && ifscope != lo_ifp->if_index) {
 688                                 error = EADDRNOTAVAIL;
 689                                 goto bad;
 690                         }
 691
 692                         /*
 693                          * If the caller didn't explicitly specify the scope,
 694                          * pick it up from the source interface.  If the cached
 695                          * route was wrong and was blown away as part of source
 696                          * interface selection, don't mask out RTF_PRCLONING
 697                          * since that route may have been allocated by the ULP,
 698                          * unless the IP header was created by the caller or
 699                          * the destination is IPv4 LLA.  The check for the
 700                          * latter is needed because IPv4 LLAs are never scoped
 701                          * in the current implementation, and we don't want to
 702                          * replace the resolved IPv4 LLA route with one whose
 703                          * gateway points to that of the default gateway on
 704                          * the primary interface of the system.
 705                          */
 706                         if (ia0 != NULL) {
 707                                 if (ifscope == IFSCOPE_NONE) {
 708                                         ifscope = ia0->ifa_ifp->if_index;
 709                                 }
 710                                 cloneok = (!(flags & IP_RAWOUTPUT) &&
 711                                     !(IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))));
 712                         }
 713                 }
 714
 715                 /*
 716                  * If this is the case, we probably don't want to allocate
 717                  * a protocol-cloned route since we didn't get one from the
 718                  * ULP.  This lets TCP do its thing, while not burdening
 719                  * forwarding or ICMP with the overhead of cloning a route.
 720                  * Of course, we still want to do any cloning requested by
 721                  * the link layer, as this is probably required in all cases
 722                  * for correct operation (as it is for ARP).
 723                  */
 724                 if (ro->ro_rt == NULL) {
 725                         uint32_t ign = RTF_PRCLONING;
 726                         /*
 727                          * We make an exception here: if the destination
 728                          * address is INADDR_BROADCAST, allocate a protocol-
 729                          * cloned host route so that we end up with a route
 730                          * marked with the RTF_BROADCAST flag.  Otherwise,
 731                          * we would end up referring to the default route,
 732                          * instead of creating a cloned host route entry.
 733                          * That would introduce inconsistencies between ULPs
 734                          * that allocate a route and those that don't.  The
 735                          * RTF_BROADCAST route is important since we'd want
 736                          * to send out undirected IP broadcast packets using
 737                          * link-level broadcast address. Another exception
 738                          * is for ULP-created routes that got blown away by
 739                          * source interface selection (see above).
 740                          *
 741                          * These exceptions will no longer be necessary when
 742                          * the RTF_PRCLONING scheme is no longer present.
 743                          */
 744                         if (cloneok || dst->sin_addr.s_addr == INADDR_BROADCAST) {
 745                                 ign &= ~RTF_PRCLONING;
 746                         }
 747
 748                         /*
 749                          * Loosen the route lookup criteria if the ifscope
 750                          * corresponds to the loopback interface; this is
 751                          * needed to support Application Layer Gateways
 752                          * listening on loopback, in conjunction with packet
 753                          * filter redirection rules.  The final source IP
 754                          * address will be rewritten by the packet filter
 755                          * prior to the RFC1122 loopback check below.
 756                          */
 757                         if (ifscope == lo_ifp->if_index) {
 758                                 rtalloc_ign(ro, ign);
 759                         } else {
 760                                 rtalloc_scoped_ign(ro, ign, ifscope);
 761                         }
 762
 763                         /*
 764                          * If the route points to a cellular/expensive interface
 765                          * and the caller forbids our using interfaces of such type,
 766                          * pretend that there is no route.
 767                          */
 768                         if (ro->ro_rt != NULL) {
 769                                 RT_LOCK_SPIN(ro->ro_rt);
 770                                 if (IP_CHECK_RESTRICTIONS(ro->ro_rt->rt_ifp,
 771                                     ipobf)) {
 772                                         RT_UNLOCK(ro->ro_rt);
 773                                         ROUTE_RELEASE(ro);
 774                                         if (flags & IP_OUTARGS) {
 775                                                 ipoa->ipoa_retflags |=
 776                                                     IPOARF_IFDENIED;
 777                                         }
 778                                 } else {
 779                                         RT_UNLOCK(ro->ro_rt);
 780                                 }
 781                         }
 782                 }
 783
 784                 if (ro->ro_rt == NULL) {
 785                         OSAddAtomic(1, &ipstat.ips_noroute);
 786                         error = EHOSTUNREACH;
 787                         if (ia0 != NULL) {
 788                                 IFA_REMREF(ia0);
 789                                 ia0 = NULL;
 790                         }
 791                         goto bad;
 792                 }
 793
 794                 if (ia != NULL) {
 795                         IFA_REMREF(&ia->ia_ifa);
 796                 }
 797                 RT_LOCK_SPIN(ro->ro_rt);
 798                 ia = ifatoia(ro->ro_rt->rt_ifa);
 799                 if (ia != NULL) {
 800                         /* Become a regular mutex */
 801                         RT_CONVERT_LOCK(ro->ro_rt);
 802                         IFA_ADDREF(&ia->ia_ifa);
 803                 }
 804                 /*
 805                  * Note: ia_ifp may not be the same as rt_ifp; the latter
 806                  * is what we use for determining outbound i/f, mtu, etc.
 807                  */
 808                 ifp = ro->ro_rt->rt_ifp;
 809                 ro->ro_rt->rt_use++;
 810                 if (ro->ro_rt->rt_flags & RTF_GATEWAY) {
 811                         dst = SIN(ro->ro_rt->rt_gateway);
 812                 }
 813                 if (ro->ro_rt->rt_flags & RTF_HOST) {
 814                         /* double negation needed for bool bit field */
 815                         ipobf.isbroadcast =
 816                             !!(ro->ro_rt->rt_flags & RTF_BROADCAST);
 817                 } else {
 818                         /* Become a regular mutex */
 819                         RT_CONVERT_LOCK(ro->ro_rt);
 820                         ipobf.isbroadcast = in_broadcast(dst->sin_addr, ifp);
 821                 }
 822                 /*
 823                  * For consistency with IPv6, as well as to ensure that
 824                  * IP_RECVIF is set correctly for packets that are sent
 825                  * to one of the local addresses.  ia (rt_ifa) would have
 826                  * been fixed up by rt_setif for local routes.  This
 827                  * would make it appear as if the packet arrives on the
 828                  * interface which owns the local address.  Loopback
 829                  * multicast case is handled separately by ip_mloopback().
 830                  */
 831                 if (ia != NULL && (ifp->if_flags & IFF_LOOPBACK) &&
 832                     !IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
 833                         uint32_t srcidx;
 834
 835                         m->m_pkthdr.rcvif = ia->ia_ifa.ifa_ifp;
 836
 837                         if (ia0 != NULL) {
 838                                 srcidx = ia0->ifa_ifp->if_index;
 839                         } else if ((ro->ro_flags & ROF_SRCIF_SELECTED) &&
 840                             ro->ro_srcia != NULL) {
 841                                 srcidx = ro->ro_srcia->ifa_ifp->if_index;
 842                         } else {
 843                                 srcidx = 0;
 844                         }
 845
 846                         ip_setsrcifaddr_info(m, srcidx, NULL);
 847                         ip_setdstifaddr_info(m, 0, ia);
 848                 }
 849                 RT_UNLOCK(ro->ro_rt);
 850                 if (ia0 != NULL) {
 851                         IFA_REMREF(ia0);
 852                         ia0 = NULL;
 853                 }
 854         }
 855
 856         if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
 857                 struct ifnet *srcifp = NULL;
 858                 struct in_multi *inm;
 859                 u_int32_t vif = 0;
 860                 u_int8_t ttl = IP_DEFAULT_MULTICAST_TTL;
 861                 u_int8_t loop = IP_DEFAULT_MULTICAST_LOOP;
 862
 863                 m->m_flags |= M_MCAST;
 864                 /*
 865                  * IP destination address is multicast.  Make sure "dst"
 866                  * still points to the address in "ro".  (It may have been
 867                  * changed to point to a gateway address, above.)
 868                  */
 869                 dst = SIN(&ro->ro_dst);
 870                 /*
 871                  * See if the caller provided any multicast options
 872                  */
 873                 if (imo != NULL) {
 874                         IMO_LOCK(imo);
 875                         vif = imo->imo_multicast_vif;
 876                         ttl = imo->imo_multicast_ttl;
 877                         loop = imo->imo_multicast_loop;
 878                         if (!(flags & IP_RAWOUTPUT)) {
 879                                 ip->ip_ttl = ttl;
 880                         }
 881                         if (imo->imo_multicast_ifp != NULL) {
 882                                 ifp = imo->imo_multicast_ifp;
 883                         }
 884                         IMO_UNLOCK(imo);
 885                 } else if (!(flags & IP_RAWOUTPUT)) {
 886                         vif = -1;
 887                         ip->ip_ttl = ttl;
 888                 }
 889                 /*
 890                  * Confirm that the outgoing interface supports multicast.
 891                  */
 892                 if (imo == NULL || vif == -1) {
 893                         if (!(ifp->if_flags & IFF_MULTICAST)) {
 894                                 OSAddAtomic(1, &ipstat.ips_noroute);
 895                                 error = ENETUNREACH;
 896                                 goto bad;
 897                         }
 898                 }
 899                 /*
 900                  * If source address not specified yet, use address
 901                  * of outgoing interface.
 902                  */
 903                 if (ip->ip_src.s_addr == INADDR_ANY) {
 904                         struct in_ifaddr *ia1;
 905                         lck_rw_lock_shared(in_ifaddr_rwlock);
 906                         TAILQ_FOREACH(ia1, &in_ifaddrhead, ia_link) {
 907                                 IFA_LOCK_SPIN(&ia1->ia_ifa);
 908                                 if (ia1->ia_ifp == ifp) {
 909                                         ip->ip_src = IA_SIN(ia1)->sin_addr;
 910                                         srcifp = ifp;
 911                                         IFA_UNLOCK(&ia1->ia_ifa);
 912                                         break;
 913                                 }
 914                                 IFA_UNLOCK(&ia1->ia_ifa);
 915                         }
 916                         lck_rw_done(in_ifaddr_rwlock);
 917                         if (ip->ip_src.s_addr == INADDR_ANY) {
 918                                 error = ENETUNREACH;
 919                                 goto bad;
 920                         }
 921                 }
 922
 923                 in_multihead_lock_shared();
 924                 IN_LOOKUP_MULTI(&pkt_dst, ifp, inm);
 925                 in_multihead_lock_done();
 926                 if (inm != NULL && (imo == NULL || loop)) {
 927                         /*
 928                          * If we belong to the destination multicast group
 929                          * on the outgoing interface, and the caller did not
 930                          * forbid loopback, loop back a copy.
 931                          */
 932                         if (!TAILQ_EMPTY(&ipv4_filters)
 933 #if NECP
 934                             && !necp_packet_should_skip_filters(m)
 935 #endif // NECP
 936                             ) {
 937                                 struct ipfilter *filter;
 938                                 int seen = (inject_filter_ref == NULL);
 939
 940                                 if (imo != NULL) {
 941                                         ipf_pktopts.ippo_flags |=
 942                                             IPPOF_MCAST_OPTS;
 943                                         ipf_pktopts.ippo_mcast_ifnet = ifp;
 944                                         ipf_pktopts.ippo_mcast_ttl = ttl;
 945                                         ipf_pktopts.ippo_mcast_loop = loop;
 946                                 }
 947
 948                                 ipf_ref();
 949
 950                                 /*
 951                                  * 4135317 - always pass network byte
 952                                  * order to filter
 953                                  */
 954 #if BYTE_ORDER != BIG_ENDIAN
 955                                 HTONS(ip->ip_len);
 956                                 HTONS(ip->ip_off);
 957 #endif
 958                                 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
 959                                         if (seen == 0) {
 960                                                 if ((struct ipfilter *)
 961                                                     inject_filter_ref == filter) {
 962                                                         seen = 1;
 963                                                 }
 964                                         } else if (filter->ipf_filter.
 965                                             ipf_output != NULL) {
 966                                                 errno_t result;
 967                                                 result = filter->ipf_filter.
 968                                                     ipf_output(filter->
 969                                                     ipf_filter.cookie,
 970                                                     (mbuf_t *)&m, ippo);
 971                                                 if (result == EJUSTRETURN) {
 972                                                         ipf_unref();
 973                                                         INM_REMREF(inm);
 974                                                         goto done;
 975                                                 }
 976                                                 if (result != 0) {
 977                                                         ipf_unref();
 978                                                         INM_REMREF(inm);
 979                                                         goto bad;
 980                                                 }
 981                                         }
 982                                 }
 983
 984                                 /* set back to host byte order */
 985                                 ip = mtod(m, struct ip *);
 986 #if BYTE_ORDER != BIG_ENDIAN
 987                                 NTOHS(ip->ip_len);
 988                                 NTOHS(ip->ip_off);
 989 #endif
 990                                 ipf_unref();
 991                                 ipobf.didfilter = TRUE;
 992                         }
 993                         ip_mloopback(srcifp, ifp, m, dst, hlen);
 994                 }
 995                 if (inm != NULL) {
 996                         INM_REMREF(inm);
 997                 }
 998                 /*
 999                  * Multicasts with a time-to-live of zero may be looped-
1000                  * back, above, but must not be transmitted on a network.
1001                  * Also, multicasts addressed to the loopback interface
1002                  * are not sent -- the above call to ip_mloopback() will
1003                  * loop back a copy if this host actually belongs to the
1004                  * destination group on the loopback interface.
1005                  */
1006                 if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
1007                         m_freem(m);
1008                         goto done;
1009                 }
1010
1011                 goto sendit;
1012         }
1013         /*
1014          * If source address not specified yet, use address
1015          * of outgoing interface.
1016          */
1017         if (ip->ip_src.s_addr == INADDR_ANY) {
1018                 IFA_LOCK_SPIN(&ia->ia_ifa);
1019                 ip->ip_src = IA_SIN(ia)->sin_addr;
1020                 IFA_UNLOCK(&ia->ia_ifa);
1021         }
1022
1023         /*
1024          * Look for broadcast address
1025          * and verify user is allowed to send
1026          * such a packet.
1027          */
1028         if (ipobf.isbroadcast) {
1029                 if (!(ifp->if_flags & IFF_BROADCAST)) {
1030                         error = EADDRNOTAVAIL;
1031                         goto bad;
1032                 }
1033                 if (!(flags & IP_ALLOWBROADCAST)) {
1034                         error = EACCES;
1035                         goto bad;
1036                 }
1037                 /* don't allow broadcast messages to be fragmented */
1038                 if ((u_short)ip->ip_len > ifp->if_mtu) {
1039                         error = EMSGSIZE;
1040                         goto bad;
1041                 }
1042                 m->m_flags |= M_BCAST;
1043         } else {
1044                 m->m_flags &= ~M_BCAST;
1045         }
1046
1047 sendit:
1048 #if PF
1049         /* Invoke outbound packet filter */
1050         if (PF_IS_ENABLED) {
1051                 int rc;
1052
1053                 m0 = m; /* Save for later */
1054 #if DUMMYNET
1055                 args.fwa_m = m;
1056                 args.fwa_oif = ifp;
1057                 args.fwa_ro = ro;
1058                 args.fwa_dst = dst;
1059                 args.fwa_oflags = flags;
1060                 if (flags & IP_OUTARGS) {
1061                         args.fwa_ipoa = ipoa;
1062                 }
1063                 rc = pf_af_hook(ifp, mppn, &m, AF_INET, FALSE, &args);
1064 #else /* DUMMYNET */
1065                 rc = pf_af_hook(ifp, mppn, &m, AF_INET, FALSE, NULL);
1066 #endif /* DUMMYNET */
1067                 if (rc != 0 || m == NULL) {
1068                         /* Move to the next packet */
1069                         m = *mppn;
1070
1071                         /* Skip ahead if first packet in list got dropped */
1072                         if (packetlist == m0) {
1073                                 packetlist = m;
1074                         }
1075
1076                         if (m != NULL) {
1077                                 m0 = m;
1078                                 /* Next packet in the chain */
1079                                 goto loopit;
1080                         } else if (packetlist != NULL) {
1081                                 /* No more packet; send down the chain */
1082                                 goto sendchain;
1083                         }
1084                         /* Nothing left; we're done */
1085                         goto done;
1086                 }
1087                 m0 = m;
1088                 ip = mtod(m, struct ip *);
1089                 pkt_dst = ip->ip_dst;
1090                 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1091         }
1092 #endif /* PF */
1093         /*
1094          * Force IP TTL to 255 following draft-ietf-zeroconf-ipv4-linklocal.txt
1095          */
1096         if (IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)) ||
1097             IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) {
1098                 ip_linklocal_stat.iplls_out_total++;
1099                 if (ip->ip_ttl != MAXTTL) {
1100                         ip_linklocal_stat.iplls_out_badttl++;
1101                         ip->ip_ttl = MAXTTL;
1102                 }
1103         }
1104
1105         if (!ipobf.didfilter &&
1106             !TAILQ_EMPTY(&ipv4_filters)
1107 #if NECP
1108             && !necp_packet_should_skip_filters(m)
1109 #endif // NECP
1110             ) {
1111                 struct ipfilter *filter;
1112                 int seen = (inject_filter_ref == NULL);
1113                 ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS;
1114
1115                 /*
1116                  * Check that a TSO frame isn't passed to a filter.
1117                  * This could happen if a filter is inserted while
1118                  * TCP is sending the TSO packet.
1119                  */
1120                 if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) {
1121                         error = EMSGSIZE;
1122                         goto bad;
1123                 }
1124
1125                 ipf_ref();
1126
1127                 /* 4135317 - always pass network byte order to filter */
1128 #if BYTE_ORDER != BIG_ENDIAN
1129                 HTONS(ip->ip_len);
1130                 HTONS(ip->ip_off);
1131 #endif
1132                 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
1133                         if (seen == 0) {
1134                                 if ((struct ipfilter *)inject_filter_ref ==
1135                                     filter) {
1136                                         seen = 1;
1137                                 }
1138                         } else if (filter->ipf_filter.ipf_output) {
1139                                 errno_t result;
1140                                 result = filter->ipf_filter.
1141                                     ipf_output(filter->ipf_filter.cookie,
1142                                     (mbuf_t *)&m, ippo);
1143                                 if (result == EJUSTRETURN) {
1144                                         ipf_unref();
1145                                         goto done;
1146                                 }
1147                                 if (result != 0) {
1148                                         ipf_unref();
1149                                         goto bad;
1150                                 }
1151                         }
1152                 }
1153                 /* set back to host byte order */
1154                 ip = mtod(m, struct ip *);
1155 #if BYTE_ORDER != BIG_ENDIAN
1156                 NTOHS(ip->ip_len);
1157                 NTOHS(ip->ip_off);
1158 #endif
1159                 ipf_unref();
1160         }
1161
1162 #if NECP
1163         /* Process Network Extension Policy. Will Pass, Drop, or Rebind packet. */
1164         necp_matched_policy_id = necp_ip_output_find_policy_match(m,
1165             flags, (flags & IP_OUTARGS) ? ipoa : NULL, ro ? ro->ro_rt : NULL, &necp_result, &necp_result_parameter);
1166         if (necp_matched_policy_id) {
1167                 necp_mark_packet_from_ip(m, necp_matched_policy_id);
1168                 switch (necp_result) {
1169                 case NECP_KERNEL_POLICY_RESULT_PASS:
1170                         if (necp_result_parameter.pass_flags & NECP_KERNEL_POLICY_PASS_NO_SKIP_IPSEC) {
1171                                 break;
1172                         }
1173                         /* Check if the interface is allowed */
1174                         if (!necp_packet_is_allowed_over_interface(m, ifp)) {
1175                                 error = EHOSTUNREACH;
1176                                 OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1177                                 goto bad;
1178                         }
1179                         goto skip_ipsec;
1180                 case NECP_KERNEL_POLICY_RESULT_DROP:
1181                 case NECP_KERNEL_POLICY_RESULT_SOCKET_DIVERT:
1182                         /* Flow divert packets should be blocked at the IP layer */
1183                         error = EHOSTUNREACH;
1184                         OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1185                         goto bad;
1186                 case NECP_KERNEL_POLICY_RESULT_IP_TUNNEL: {
1187                         /* Verify that the packet is being routed to the tunnel */
1188                         struct ifnet *policy_ifp = necp_get_ifnet_from_result_parameter(&necp_result_parameter);
1189                         if (policy_ifp == ifp) {
1190                                 /* Check if the interface is allowed */
1191                                 if (!necp_packet_is_allowed_over_interface(m, ifp)) {
1192                                         error = EHOSTUNREACH;
1193                                         OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1194                                         goto bad;
1195                                 }
1196                                 goto skip_ipsec;
1197                         } else {
1198                                 if (necp_packet_can_rebind_to_ifnet(m, policy_ifp, &necp_route, AF_INET)) {
1199                                         /* Check if the interface is allowed */
1200                                         if (!necp_packet_is_allowed_over_interface(m, policy_ifp)) {
1201                                                 error = EHOSTUNREACH;
1202                                                 OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1203                                                 goto bad;
1204                                         }
1205
1206                                         /*
1207                                          * Update the QOS marking policy if
1208                                          * 1. the upper layer asks for it
1209                                          * 2. net_qos_policy_restricted is not set
1210                                          * 3. qos_marking_gencount doesn't match necp_kernel_socket_policies_gencount (checked in necp_lookup_current_qos_marking)
1211                                          */
1212                                         if (ipoa != NULL &&
1213                                             (ipoa->ipoa_flags & IPOAF_REDO_QOSMARKING_POLICY) &&
1214                                             net_qos_policy_restricted != 0) {
1215                                                 bool qos_marking = (ipoa->ipoa_flags & IPOAF_QOSMARKING_ALLOWED) ? TRUE : FALSE;
1216                                                 qos_marking = necp_lookup_current_qos_marking(&ipoa->qos_marking_gencount, NULL, policy_ifp, necp_result_parameter.route_rule_id, qos_marking);
1217                                                 if (qos_marking) {
1218                                                         ipoa->ipoa_flags |= IPOAF_QOSMARKING_ALLOWED;
1219                                                 } else {
1220                                                         ipoa->ipoa_flags &= ~IPOAF_QOSMARKING_ALLOWED;
1221                                                 }
1222                                         }
1223
1224                                         /* Set ifp to the tunnel interface, since it is compatible with the packet */
1225                                         ifp = policy_ifp;
1226                                         ro = &necp_route;
1227                                         goto skip_ipsec;
1228                                 } else {
1229                                         error = ENETUNREACH;
1230                                         OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1231                                         goto bad;
1232                                 }
1233                         }
1234                 }
1235                 default:
1236                         break;
1237                 }
1238         }
1239         /* Catch-all to check if the interface is allowed */
1240         if (!necp_packet_is_allowed_over_interface(m, ifp)) {
1241                 error = EHOSTUNREACH;
1242                 OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1243                 goto bad;
1244         }
1245 #endif /* NECP */
1246
1247 #if IPSEC
1248         if (ipsec_bypass != 0 || (flags & IP_NOIPSEC)) {
1249                 goto skip_ipsec;
1250         }
1251
1252         KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
1253
1254         if (sp == NULL) {
1255                 /* get SP for this packet */
1256                 if (so != NULL) {
1257                         sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND,
1258                             so, &error);
1259                 } else {
1260                         sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND,
1261                             flags, &error);
1262                 }
1263                 if (sp == NULL) {
1264                         IPSEC_STAT_INCREMENT(ipsecstat.out_inval);
1265                         KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1266                             0, 0, 0, 0, 0);
1267                         goto bad;
1268                 }
1269         }
1270
1271         error = 0;
1272
1273         /* check policy */
1274         switch (sp->policy) {
1275         case IPSEC_POLICY_DISCARD:
1276         case IPSEC_POLICY_GENERATE:
1277                 /*
1278                  * This packet is just discarded.
1279                  */
1280                 IPSEC_STAT_INCREMENT(ipsecstat.out_polvio);
1281                 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1282                     1, 0, 0, 0, 0);
1283                 goto bad;
1284
1285         case IPSEC_POLICY_BYPASS:
1286         case IPSEC_POLICY_NONE:
1287                 /* no need to do IPsec. */
1288                 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1289                     2, 0, 0, 0, 0);
1290                 goto skip_ipsec;
1291
1292         case IPSEC_POLICY_IPSEC:
1293                 if (sp->req == NULL) {
1294                         /* acquire a policy */
1295                         error = key_spdacquire(sp);
1296                         KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1297                             3, 0, 0, 0, 0);
1298                         goto bad;
1299                 }
1300                 if (sp->ipsec_if) {
1301                         /* Verify the redirect to ipsec interface */
1302                         if (sp->ipsec_if == ifp) {
1303                                 goto skip_ipsec;
1304                         }
1305                         goto bad;
1306                 }
1307                 break;
1308
1309         case IPSEC_POLICY_ENTRUST:
1310         default:
1311                 printf("ip_output: Invalid policy found. %d\n", sp->policy);
1312         }
1313         {
1314                 ipsec_state.m = m;
1315                 if (flags & IP_ROUTETOIF) {
1316                         bzero(&ipsec_state.ro, sizeof(ipsec_state.ro));
1317                 } else {
1318                         route_copyout((struct route *)&ipsec_state.ro, ro, sizeof(struct route));
1319                 }
1320                 ipsec_state.dst = SA(dst);
1321
1322                 ip->ip_sum = 0;
1323
1324                 /*
1325                  * XXX
1326                  * delayed checksums are not currently compatible with IPsec
1327                  */
1328                 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1329                         in_delayed_cksum(m);
1330                 }
1331
1332 #if BYTE_ORDER != BIG_ENDIAN
1333                 HTONS(ip->ip_len);
1334                 HTONS(ip->ip_off);
1335 #endif
1336
1337                 DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
1338                     struct ip *, ip, struct ifnet *, ifp,
1339                     struct ip *, ip, struct ip6_hdr *, NULL);
1340
1341                 error = ipsec4_output(&ipsec_state, sp, flags);
1342                 if (ipsec_state.tunneled == 6) {
1343                         m0 = m = NULL;
1344                         error = 0;
1345                         goto bad;
1346                 }
1347
1348                 m0 = m = ipsec_state.m;
1349
1350 #if DUMMYNET
1351                 /*
1352                  * If we're about to use the route in ipsec_state
1353                  * and this came from dummynet, clean up now.
1354                  */
1355                 if (ro == &saved_route &&
1356                     (!(flags & IP_ROUTETOIF) || ipsec_state.tunneled)) {
1357                         ROUTE_RELEASE(ro);
1358                 }
1359 #endif /* DUMMYNET */
1360
1361                 if (flags & IP_ROUTETOIF) {
1362                         /*
1363                          * if we have tunnel mode SA, we may need to ignore
1364                          * IP_ROUTETOIF.
1365                          */
1366                         if (ipsec_state.tunneled) {
1367                                 flags &= ~IP_ROUTETOIF;
1368                                 ro = (struct route *)&ipsec_state.ro;
1369                         }
1370                 } else {
1371                         ro = (struct route *)&ipsec_state.ro;
1372                 }
1373                 dst = SIN(ipsec_state.dst);
1374                 if (error) {
1375                         /* mbuf is already reclaimed in ipsec4_output. */
1376                         m0 = NULL;
1377                         switch (error) {
1378                         case EHOSTUNREACH:
1379                         case ENETUNREACH:
1380                         case EMSGSIZE:
1381                         case ENOBUFS:
1382                         case ENOMEM:
1383                                 break;
1384                         default:
1385                                 printf("ip4_output (ipsec): error code %d\n", error);
1386                                 OS_FALLTHROUGH;
1387                         case ENOENT:
1388                                 /* don't show these error codes to the user */
1389                                 error = 0;
1390                                 break;
1391                         }
1392                         KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1393                             4, 0, 0, 0, 0);
1394                         goto bad;
1395                 }
1396         }
1397
1398         /* be sure to update variables that are affected by ipsec4_output() */
1399         ip = mtod(m, struct ip *);
1400
1401 #ifdef _IP_VHL
1402         hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1403 #else /* !_IP_VHL */
1404         hlen = ip->ip_hl << 2;
1405 #endif /* !_IP_VHL */
1406         /* Check that there wasn't a route change and src is still valid */
1407         if (ROUTE_UNUSABLE(ro)) {
1408                 ROUTE_RELEASE(ro);
1409                 VERIFY(src_ia == NULL);
1410                 if (ip->ip_src.s_addr != INADDR_ANY &&
1411                     !(flags & (IP_ROUTETOIF | IP_FORWARDING)) &&
1412                     (src_ia = ifa_foraddr(ip->ip_src.s_addr)) == NULL) {
1413                         error = EADDRNOTAVAIL;
1414                         KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1415                             5, 0, 0, 0, 0);
1416                         goto bad;
1417                 }
1418                 if (src_ia != NULL) {
1419                         IFA_REMREF(&src_ia->ia_ifa);
1420                         src_ia = NULL;
1421                 }
1422         }
1423
1424         if (ro->ro_rt == NULL) {
1425                 if (!(flags & IP_ROUTETOIF)) {
1426                         printf("%s: can't update route after "
1427                             "IPsec processing\n", __func__);
1428                         error = EHOSTUNREACH;   /* XXX */
1429                         KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1430                             6, 0, 0, 0, 0);
1431                         goto bad;
1432                 }
1433         } else {
1434                 if (ia != NULL) {
1435                         IFA_REMREF(&ia->ia_ifa);
1436                 }
1437                 RT_LOCK_SPIN(ro->ro_rt);
1438                 ia = ifatoia(ro->ro_rt->rt_ifa);
1439                 if (ia != NULL) {
1440                         /* Become a regular mutex */
1441                         RT_CONVERT_LOCK(ro->ro_rt);
1442                         IFA_ADDREF(&ia->ia_ifa);
1443                 }
1444                 ifp = ro->ro_rt->rt_ifp;
1445                 RT_UNLOCK(ro->ro_rt);
1446         }
1447
1448         /* make it flipped, again. */
1449 #if BYTE_ORDER != BIG_ENDIAN
1450         NTOHS(ip->ip_len);
1451         NTOHS(ip->ip_off);
1452 #endif
1453         KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1454             7, 0xff, 0xff, 0xff, 0xff);
1455
1456         /* Pass to filters again */
1457         if (!TAILQ_EMPTY(&ipv4_filters)
1458 #if NECP
1459             && !necp_packet_should_skip_filters(m)
1460 #endif // NECP
1461             ) {
1462                 struct ipfilter *filter;
1463
1464                 ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS;
1465
1466                 /*
1467                  * Check that a TSO frame isn't passed to a filter.
1468                  * This could happen if a filter is inserted while
1469                  * TCP is sending the TSO packet.
1470                  */
1471                 if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) {
1472                         error = EMSGSIZE;
1473                         goto bad;
1474                 }
1475
1476                 ipf_ref();
1477
1478                 /* 4135317 - always pass network byte order to filter */
1479 #if BYTE_ORDER != BIG_ENDIAN
1480                 HTONS(ip->ip_len);
1481                 HTONS(ip->ip_off);
1482 #endif
1483                 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
1484                         if (filter->ipf_filter.ipf_output) {
1485                                 errno_t result;
1486                                 result = filter->ipf_filter.
1487                                     ipf_output(filter->ipf_filter.cookie,
1488                                     (mbuf_t *)&m, ippo);
1489                                 if (result == EJUSTRETURN) {
1490                                         ipf_unref();
1491                                         goto done;
1492                                 }
1493                                 if (result != 0) {
1494                                         ipf_unref();
1495                                         goto bad;
1496                                 }
1497                         }
1498                 }
1499                 /* set back to host byte order */
1500                 ip = mtod(m, struct ip *);
1501 #if BYTE_ORDER != BIG_ENDIAN
1502                 NTOHS(ip->ip_len);
1503                 NTOHS(ip->ip_off);
1504 #endif
1505                 ipf_unref();
1506         }
1507 skip_ipsec:
1508 #endif /* IPSEC */
1509
1510
1511         /* 127/8 must not appear on wire - RFC1122 */
1512         if (!(ifp->if_flags & IFF_LOOPBACK) &&
1513             ((ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
1514             (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)) {
1515                 OSAddAtomic(1, &ipstat.ips_badaddr);
1516                 error = EADDRNOTAVAIL;
1517                 goto bad;
1518         }
1519
1520         if (ipoa != NULL) {
1521                 u_int8_t dscp = ip->ip_tos >> IPTOS_DSCP_SHIFT;
1522
1523                 error = set_packet_qos(m, ifp,
1524                     ipoa->ipoa_flags & IPOAF_QOSMARKING_ALLOWED ? TRUE : FALSE,
1525                     ipoa->ipoa_sotc, ipoa->ipoa_netsvctype, &dscp);
1526                 if (error == 0) {
1527                         ip->ip_tos &= IPTOS_ECN_MASK;
1528                         ip->ip_tos |= dscp << IPTOS_DSCP_SHIFT;
1529                 } else {
1530                         printf("%s if_dscp_for_mbuf() error %d\n", __func__, error);
1531                         error = 0;
1532                 }
1533         }
1534
1535         ip_output_checksum(ifp, m, (IP_VHL_HL(ip->ip_vhl) << 2),
1536             ip->ip_len, &sw_csum);
1537
1538         interface_mtu = ifp->if_mtu;
1539
1540         if (INTF_ADJUST_MTU_FOR_CLAT46(ifp)) {
1541                 interface_mtu = IN6_LINKMTU(ifp);
1542                 /* Further adjust the size for CLAT46 expansion */
1543                 interface_mtu -= CLAT46_HDR_EXPANSION_OVERHD;
1544         }
1545
1546         /*
1547          * If small enough for interface, or the interface will take
1548          * care of the fragmentation for us, can just send directly.
1549          */
1550         if ((u_short)ip->ip_len <= interface_mtu || TSO_IPV4_OK(ifp, m) ||
1551             (!(ip->ip_off & IP_DF) && (ifp->if_hwassist & CSUM_FRAGMENT))) {
1552 #if BYTE_ORDER != BIG_ENDIAN
1553                 HTONS(ip->ip_len);
1554                 HTONS(ip->ip_off);
1555 #endif
1556
1557                 ip->ip_sum = 0;
1558                 if (sw_csum & CSUM_DELAY_IP) {
1559                         ip->ip_sum = ip_cksum_hdr_out(m, hlen);
1560                         sw_csum &= ~CSUM_DELAY_IP;
1561                         m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP;
1562                 }
1563
1564 #if IPSEC
1565                 /* clean ipsec history once it goes out of the node */
1566                 if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) {
1567                         ipsec_delaux(m);
1568                 }
1569 #endif /* IPSEC */
1570                 if ((m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) &&
1571                     (m->m_pkthdr.tso_segsz > 0)) {
1572                         scnt += m->m_pkthdr.len / m->m_pkthdr.tso_segsz;
1573                 } else {
1574                         scnt++;
1575                 }
1576
1577                 if (packetchain == 0) {
1578                         if (ro->ro_rt != NULL && nstat_collect) {
1579                                 nstat_route_tx(ro->ro_rt, scnt,
1580                                     m->m_pkthdr.len, 0);
1581                         }
1582
1583                         error = dlil_output(ifp, PF_INET, m, ro->ro_rt,
1584                             SA(dst), 0, adv);
1585                         if (dlil_verbose && error) {
1586                                 printf("dlil_output error on interface %s: %d\n",
1587                                     ifp->if_xname, error);
1588                         }
1589                         scnt = 0;
1590                         goto done;
1591                 } else {
1592                         /*
1593                          * packet chaining allows us to reuse the
1594                          * route for all packets
1595                          */
1596                         bytecnt += m->m_pkthdr.len;
1597                         mppn = &m->m_nextpkt;
1598                         m = m->m_nextpkt;
1599                         if (m == NULL) {
1600 #if PF
1601 sendchain:
1602 #endif /* PF */
1603                                 if (pktcnt > ip_maxchainsent) {
1604                                         ip_maxchainsent = pktcnt;
1605                                 }
1606                                 if (ro->ro_rt != NULL && nstat_collect) {
1607                                         nstat_route_tx(ro->ro_rt, scnt,
1608                                             bytecnt, 0);
1609                                 }
1610
1611                                 error = dlil_output(ifp, PF_INET, packetlist,
1612                                     ro->ro_rt, SA(dst), 0, adv);
1613                                 if (dlil_verbose && error) {
1614                                         printf("dlil_output error on interface %s: %d\n",
1615                                             ifp->if_xname, error);
1616                                 }
1617                                 pktcnt = 0;
1618                                 scnt = 0;
1619                                 bytecnt = 0;
1620                                 goto done;
1621                         }
1622                         m0 = m;
1623                         pktcnt++;
1624                         goto loopit;
1625                 }
1626         }
1627
1628         VERIFY(interface_mtu != 0);
1629         /*
1630          * Too large for interface; fragment if possible.
1631          * Must be able to put at least 8 bytes per fragment.
1632          * Balk when DF bit is set or the interface didn't support TSO.
1633          */
1634         if ((ip->ip_off & IP_DF) || pktcnt > 0 ||
1635             (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4)) {
1636                 error = EMSGSIZE;
1637                 /*
1638                  * This case can happen if the user changed the MTU
1639                  * of an interface after enabling IP on it.  Because
1640                  * most netifs don't keep track of routes pointing to
1641                  * them, there is no way for one to update all its
1642                  * routes when the MTU is changed.
1643                  */
1644                 if (ro->ro_rt) {
1645                         RT_LOCK_SPIN(ro->ro_rt);
1646                         if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) &&
1647                             !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) &&
1648                             (ro->ro_rt->rt_rmx.rmx_mtu > interface_mtu)) {
1649                                 ro->ro_rt->rt_rmx.rmx_mtu = interface_mtu;
1650                         }
1651                         RT_UNLOCK(ro->ro_rt);
1652                 }
1653                 if (pktcnt > 0) {
1654                         m0 = packetlist;
1655                 }
1656                 OSAddAtomic(1, &ipstat.ips_cantfrag);
1657                 goto bad;
1658         }
1659
1660         /*
1661          * XXX Only TCP seems to be passing a list of packets here.
1662          * The following issue is limited to UDP datagrams with 0 checksum.
1663          * For now limit it to the case when single packet is passed down.
1664          */
1665         if (packetchain == 0 && IS_INTF_CLAT46(ifp)) {
1666                 /*
1667                  * If it is a UDP packet that has checksum set to 0
1668                  * and is also not being offloaded, compute a full checksum
1669                  * and update the UDP checksum.
1670                  */
1671                 if (ip->ip_p == IPPROTO_UDP &&
1672                     !(m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_PARTIAL))) {
1673                         struct udphdr *uh = NULL;
1674
1675                         if (m->m_len < hlen + sizeof(struct udphdr)) {
1676                                 m = m_pullup(m, hlen + sizeof(struct udphdr));
1677                                 if (m == NULL) {
1678                                         error = ENOBUFS;
1679                                         m0 = m;
1680                                         goto bad;
1681                                 }
1682                                 m0 = m;
1683                                 ip = mtod(m, struct ip *);
1684                         }
1685                         /*
1686                          * Get UDP header and if checksum is 0, then compute the full
1687                          * checksum.
1688                          */
1689                         uh = (struct udphdr *)(void *)((caddr_t)ip + hlen);
1690                         if (uh->uh_sum == 0) {
1691                                 uh->uh_sum = inet_cksum(m, IPPROTO_UDP, hlen,
1692                                     ip->ip_len - hlen);
1693                                 if (uh->uh_sum == 0) {
1694                                         uh->uh_sum = 0xffff;
1695                                 }
1696                         }
1697                 }
1698         }
1699
1700         error = ip_fragment(m, ifp, interface_mtu, sw_csum);
1701         if (error != 0) {
1702                 m0 = m = NULL;
1703                 goto bad;
1704         }
1705
1706         KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr,
1707             ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len);
1708
1709         for (m = m0; m; m = m0) {
1710                 m0 = m->m_nextpkt;
1711                 m->m_nextpkt = 0;
1712 #if IPSEC
1713                 /* clean ipsec history once it goes out of the node */
1714                 if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) {
1715                         ipsec_delaux(m);
1716                 }
1717 #endif /* IPSEC */
1718                 if (error == 0) {
1719                         if ((packetchain != 0) && (pktcnt > 0)) {
1720                                 panic("%s: mix of packet in packetlist is "
1721                                     "wrong=%p", __func__, packetlist);
1722                                 /* NOTREACHED */
1723                         }
1724                         if (ro->ro_rt != NULL && nstat_collect) {
1725                                 nstat_route_tx(ro->ro_rt, 1,
1726                                     m->m_pkthdr.len, 0);
1727                         }
1728                         error = dlil_output(ifp, PF_INET, m, ro->ro_rt,
1729                             SA(dst), 0, adv);
1730                         if (dlil_verbose && error) {
1731                                 printf("dlil_output error on interface %s: %d\n",
1732                                     ifp->if_xname, error);
1733                         }
1734                 } else {
1735                         m_freem(m);
1736                 }
1737         }
1738
1739         if (error == 0) {
1740                 OSAddAtomic(1, &ipstat.ips_fragmented);
1741         }
1742
1743 done:
1744         if (ia != NULL) {
1745                 IFA_REMREF(&ia->ia_ifa);
1746                 ia = NULL;
1747         }
1748 #if IPSEC
1749         ROUTE_RELEASE(&ipsec_state.ro);
1750         if (sp != NULL) {
1751                 KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
1752                     printf("DP ip_output call free SP:%x\n", sp));
1753                 key_freesp(sp, KEY_SADB_UNLOCKED);
1754         }
1755 #endif /* IPSEC */
1756 #if NECP
1757         ROUTE_RELEASE(&necp_route);
1758 #endif /* NECP */
1759 #if DUMMYNET
1760         ROUTE_RELEASE(&saved_route);
1761 #endif /* DUMMYNET */
1762
1763         KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_END, error, 0, 0, 0, 0);
1764         if (ip_output_measure) {
1765                 net_perf_measure_time(&net_perf, &start_tv, packets_processed);
1766                 net_perf_histogram(&net_perf, packets_processed);
1767         }
1768         return error;
1769 bad:
1770         if (pktcnt > 0) {
1771                 m0 = packetlist;
1772         }
1773         m_freem_list(m0);
1774         goto done;
1775
1776 #undef ipsec_state
1777 #undef args
1778 #undef sro_fwd
1779 #undef saved_route
1780 #undef ipf_pktopts
1781 #undef IP_CHECK_RESTRICTIONS
1782 }
1783
/*
 * Split the IPv4 packet in m into fragments that fit within mtu.
 *
 * On success the original mbuf m is trimmed to become the first
 * fragment and every further fragment is linked after it through
 * m_nextpkt, in order; nothing is transmitted here.  ip_len and
 * ip_off of the original header are read as host-order values on
 * entry; every fragment header is left with ip_len and ip_off in
 * network byte order.
 *
 * m        packet to fragment; its data starts at the IP header
 * ifp      outgoing interface; consulted for IS_INTF_CLAT46() and
 *          for the CSUM_IP_FRAGS bit of if_hwassist
 * mtu      size limit for each fragment, IP header included
 * sw_csum  when CSUM_DELAY_IP is set, each fragment's IP header
 *          checksum is computed here with ip_cksum_hdr_out()
 *
 * Returns 0 on success, EMSGSIZE when fewer than 8 payload bytes fit
 * in a fragment, or ENOBUFS when a fragment mbuf cannot be allocated
 * or its payload cannot be copied.  On every error path the packet
 * (together with any fragments already chained to it) has been freed
 * before returning, so the caller must not touch m again.
 */
1784 int
1785 ip_fragment(struct mbuf *m, struct ifnet *ifp, uint32_t mtu, int sw_csum)
1786 {
1787         struct ip *ip, *mhip;
1788         int len, hlen, mhlen, firstlen, off, error = 0;
1789         struct mbuf **mnext = &m->m_nextpkt, *m0;
1790         int nfrags = 1;
1791
1792         ip = mtod(m, struct ip *);
1793 #ifdef _IP_VHL
1794         hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1795 #else /* !_IP_VHL */
1796         hlen = ip->ip_hl << 2;
1797 #endif /* !_IP_VHL */
1798
1799         /*
1800          * We need to adjust the fragment sizes to account
1801          * for IPv6 fragment header if it needs to be translated
1802          * from IPv4 to IPv6.
1803          */
1804         if (IS_INTF_CLAT46(ifp)) {
1805                 mtu -= sizeof(struct ip6_frag);
1806         }
1807
             /*
              * Payload bytes carried by each full fragment: whatever is left
              * of mtu after the header, rounded down to a multiple of 8
              * because fragment offsets are stored in 8-byte units (see the
              * ">> 3" below).  The arithmetic is unsigned (mtu is uint32_t),
              * so an mtu smaller than hlen wraps; the result is then stored
              * in a signed int and rejected by the "< 8" test.
              */
1808         firstlen = len = (mtu - hlen) & ~7;
1809         if (len < 8) {
1810                 m_freem(m);
1811                 return EMSGSIZE;
1812         }
1813
1814         /*
1815          * if the interface will not calculate checksums on
1816          * fragmented packets, then do it here.
1817          */
1818         if ((m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) &&
1819             !(ifp->if_hwassist & CSUM_IP_FRAGS)) {
1820                 in_delayed_cksum(m);
1821         }
1822
1823         /*
1824          * Loop through length of segment after first fragment,
1825          * make new header and copy data of each part and link onto chain.
1826          */
             /*
              * off is the byte offset into the original packet (IP header
              * included) of the data carried by the fragment being built;
              * the first hlen + len bytes stay in m0 as the first fragment.
              * mhlen is the header length used for the new fragments; it
              * starts as a bare struct ip and is replaced below when the
              * original header carries options.
              */
1827         m0 = m;
1828         mhlen = sizeof(struct ip);
1829         for (off = hlen + len; off < (u_short)ip->ip_len; off += len) {
1830                 MGETHDR(m, M_DONTWAIT, MT_HEADER);      /* MAC-OK */
1831                 if (m == NULL) {
1832                         error = ENOBUFS;
1833                         OSAddAtomic(1, &ipstat.ips_odropped);
1834                         goto sendorfree;
1835                 }
1836                 m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
                     /*
                      * Skip max_linkhdr bytes at the front of the new mbuf
                      * before placing the IP header (presumably headroom
                      * for the link-layer header -- confirm).
                      */
1837                 m->m_data += max_linkhdr;
1838                 mhip = mtod(m, struct ip *);
1839                 *mhip = *ip;
                     /*
                      * The original header has options: let ip_optcopy()
                      * fill in this fragment's options and use its return
                      * value as the option length when rebuilding the
                      * header-length field.
                      */
1840                 if (hlen > sizeof(struct ip)) {
1841                         mhlen = ip_optcopy(ip, mhip) + sizeof(struct ip);
1842                         mhip->ip_vhl = IP_MAKE_VHL(IPVERSION, mhlen >> 2);
1843                 }
1844                 m->m_len = mhlen;
                     /*
                      * Fragment offset in 8-byte units, added to the
                      * original ip_off with IP_MF masked out; IP_MF is
                      * put back if the original had it set.
                      */
1845                 mhip->ip_off = (u_short)(((off - hlen) >> 3) + (ip->ip_off & ~IP_MF));
1846                 if (ip->ip_off & IP_MF) {
1847                         mhip->ip_off |= IP_MF;
1848                 }
                     /*
                      * Last piece: shrink len to what remains.  Any other
                      * piece is followed by more data, so mark it IP_MF.
                      */
1849                 if (off + len >= (u_short)ip->ip_len) {
1850                         len = (u_short)ip->ip_len - off;
1851                 } else {
1852                         mhip->ip_off |= IP_MF;
1853                 }
1854                 mhip->ip_len = htons((u_short)(len + mhlen));
1855                 m->m_next = m_copy(m0, off, len);
1856                 if (m->m_next == NULL) {
                             /* m is not on the chain yet, so free it here */
1857                         (void) m_free(m);
1858                         error = ENOBUFS;        /* ??? */
1859                         OSAddAtomic(1, &ipstat.ips_odropped);
1860                         goto sendorfree;
1861                 }
1862                 m->m_pkthdr.len = mhlen + len;
1863                 m->m_pkthdr.rcvif = NULL;
1864                 m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
1865
1866                 M_COPY_CLASSIFIER(m, m0);
1867                 M_COPY_PFTAG(m, m0);
1868
1869 #if BYTE_ORDER != BIG_ENDIAN
1870                 HTONS(mhip->ip_off);
1871 #endif
1872
1873                 mhip->ip_sum = 0;
1874                 if (sw_csum & CSUM_DELAY_IP) {
1875                         mhip->ip_sum = ip_cksum_hdr_out(m, mhlen);
1876                         m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP;
1877                 }
                     /* append the finished fragment to the m_nextpkt chain */
1878                 *mnext = m;
1879                 mnext = &m->m_nextpkt;
1880                 nfrags++;
1881         }
1882         OSAddAtomic(nfrags, &ipstat.ips_ofragments);
1883
             /*
              * Here m is the last fragment built by the loop (or still m0
              * if the loop body never ran); the error exits above jump
              * straight to sendorfree and never reach this code.
              */
1884         /* set first/last markers for fragment chain */
1885         m->m_flags |= M_LASTFRAG;
1886         m0->m_flags |= M_FIRSTFRAG | M_FRAG;
1887         m0->m_pkthdr.csum_data = nfrags;
1888
1889         /*
1890          * Update first fragment by trimming what's been copied out
1891          * and updating header; the caller sends each fragment (in order).
1892          */
             /*
              * The m_adj() count is negative whenever the packet is longer
              * than the first fragment (presumably a negative count trims
              * from the tail of the chain -- confirm against m_adj()).
              */
1893         m = m0;
1894         m_adj(m, hlen + firstlen - (u_short)ip->ip_len);
1895         m->m_pkthdr.len = hlen + firstlen;
1896         ip->ip_len = htons((u_short)m->m_pkthdr.len);
1897         ip->ip_off |= IP_MF;
1898
1899 #if BYTE_ORDER != BIG_ENDIAN
1900         HTONS(ip->ip_off);
1901 #endif
1902
1903         ip->ip_sum = 0;
1904         if (sw_csum & CSUM_DELAY_IP) {
1905                 ip->ip_sum = ip_cksum_hdr_out(m, hlen);
1906                 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP;
1907         }
1908 sendorfree:
             /* on failure drop the original and every fragment linked to it */
1909         if (error) {
1910                 m_freem_list(m0);
1911         }
1912
1913         return error;
1914 }
1915
1916 static void
1917 ip_out_cksum_stats(int proto, u_int32_t len)
1918 {
1919         /*
1920          * Hand len to the per-protocol software checksum counters;
1921          * only TCP and UDP keep such stats for now, others are ignored.
1922          */
1923         if (proto == IPPROTO_TCP) {
1924                 tcp_out_cksum_stats(len);
1925                 return;
1926         }
1927         if (proto == IPPROTO_UDP) {
1928                 udp_out_cksum_stats(len);
1929         }
1930 }
1931
1932 /*
1933  * Process a delayed payload checksum calculation (outbound path.)
1934  *
1935  * hoff is the number of bytes beyond the mbuf data pointer which
1936  * points to the IP header.
1937  *
      * csum_flags selects the work to consider: only bits that are set
      * both in csum_flags and in m->m_pkthdr.csum_flags, and that are one
      * of CSUM_DELAY_IP or CSUM_DELAY_DATA, are acted upon.  Each checksum
      * computed here is written into the packet, and the corresponding
      * flag is cleared from m->m_pkthdr.csum_flags.
      *
1938  * Returns a bitmask representing all the work done in software.
      *
      * Panics when the packet is too short to hold the IP header at hoff,
      * when the ULP checksum offset taken from csum_data fails its sanity
      * check, or when the fallback length (mlen - hoff) exceeds UINT16_MAX.
1939  */
1940 uint32_t
1941 in_finalize_cksum(struct mbuf *m, uint32_t hoff, uint32_t csum_flags)
1942 {
             /* scratch copy of the IP header: 15 << 2 = 60 bytes */
1943         unsigned char buf[15 << 2] __attribute__((aligned(8)));
1944         struct ip *ip;
1945         uint32_t offset, _hlen, mlen, hlen, len, sw_csum;
1946         uint16_t csum, ip_len;
1947
1948         _CASSERT(sizeof(csum) == sizeof(uint16_t));
1949         VERIFY(m->m_flags & M_PKTHDR);
1950
             /* work the caller asked for that is still pending on this mbuf */
1951         sw_csum = (csum_flags & m->m_pkthdr.csum_flags);
1952
1953         if ((sw_csum &= (CSUM_DELAY_IP | CSUM_DELAY_DATA)) == 0) {
1954                 goto done;
1955         }
1956
1957         mlen = m->m_pkthdr.len;                         /* total mbuf len */
1958
1959         /* sanity check (need at least simple IP header) */
1960         if (mlen < (hoff + sizeof(*ip))) {
1961                 panic("%s: mbuf %p pkt len (%u) < hoff+ip_hdr "
1962                     "(%u+%u)\n", __func__, m, mlen, hoff,
1963                     (uint32_t)sizeof(*ip));
1964                 /* NOTREACHED */
1965         }
1966
1967         /*
1968          * In case the IP header is not contiguous, or not 32-bit aligned,
1969          * or if we're computing the IP header checksum, copy it to a local
1970          * buffer.  Copy only the simple IP header here (IP options case
1971          * is handled below.)
1972          */
             /*
              * _hlen records how many header bytes currently sit in buf;
              * 0 means ip points straight into the mbuf data.
              */
1973         if ((sw_csum & CSUM_DELAY_IP) || (hoff + sizeof(*ip)) > m->m_len ||
1974             !IP_HDR_ALIGNED_P(mtod(m, caddr_t) + hoff)) {
1975                 m_copydata(m, hoff, sizeof(*ip), (caddr_t)buf);
1976                 ip = (struct ip *)(void *)buf;
1977                 _hlen = sizeof(*ip);
1978         } else {
1979                 ip = (struct ip *)(void *)(m->m_data + hoff);
1980                 _hlen = 0;
1981         }
1982
1983         hlen = IP_VHL_HL(ip->ip_vhl) << 2;              /* IP header len */
1984
1985         /* sanity check */
1986         if (mlen < (hoff + hlen)) {
1987                 panic("%s: mbuf %p pkt too short (%d) for IP header (%u), "
1988                     "hoff %u", __func__, m, mlen, hlen, hoff);
1989                 /* NOTREACHED */
1990         }
1991
1992         /*
1993          * We could be in the context of an IP or interface filter; in the
1994          * former case, ip_len would be in host (correct) order while for
1995          * the latter it would be in network order.  Because of this, we
1996          * attempt to interpret the length field by comparing it against
1997          * the actual packet length.  If the comparison fails, byte swap
1998          * the length and check again.  If it still fails, use the actual
1999          * packet length.  This also covers the trailing bytes case.
2000          */
2001         ip_len = ip->ip_len;
2002         if (ip_len != (mlen - hoff)) {
2003                 ip_len = OSSwapInt16(ip_len);
2004                 if (ip_len != (mlen - hoff)) {
2005                         printf("%s: mbuf 0x%llx proto %d IP len %d (%x) "
2006                             "[swapped %d (%x)] doesn't match actual packet "
2007                             "length; %d is used instead\n", __func__,
2008                             (uint64_t)VM_KERNEL_ADDRPERM(m), ip->ip_p,
2009                             ip->ip_len, ip->ip_len, ip_len, ip_len,
2010                             (mlen - hoff));
2011                         if (mlen - hoff > UINT16_MAX) {
2012                                 panic("%s: mlen %u - hoff %u > 65535",
2013                                     __func__, mlen, hoff);
2014                         }
2015                         ip_len = (uint16_t)(mlen - hoff);
2016                 }
2017         }
2018
             /*
              * Whichever branch was taken, ip_len now equals mlen - hoff,
              * and mlen >= hoff + hlen was verified above, so this
              * unsigned subtraction cannot wrap.
              */
2019         len = ip_len - hlen;                            /* csum span */
2020
2021         if (sw_csum & CSUM_DELAY_DATA) {
2022                 uint16_t ulpoff;
2023
2024                 /*
2025                  * offset is added to the lower 16-bit value of csum_data,
2026                  * which is expected to contain the ULP offset; therefore
2027                  * CSUM_PARTIAL offset adjustment must be undone.
2028                  */
2029                 if ((m->m_pkthdr.csum_flags & (CSUM_PARTIAL | CSUM_DATA_VALID)) ==
2030                     (CSUM_PARTIAL | CSUM_DATA_VALID)) {
2031                         /*
2032                          * Get back the original ULP offset (this will
2033                          * undo the CSUM_PARTIAL logic in ip_output.)
2034                          */
2035                         m->m_pkthdr.csum_data = (m->m_pkthdr.csum_tx_stuff -
2036                             m->m_pkthdr.csum_tx_start);
2037                 }
2038
2039                 ulpoff = (m->m_pkthdr.csum_data & 0xffff); /* ULP csum offset */
2040                 offset = hoff + hlen;                   /* ULP header */
2041
                     /*
                      * NOTE(review): this only compares mlen against ulpoff
                      * itself, whereas the store below lands at
                      * hoff + hlen + ulpoff -- confirm the intended bound.
                      */
2042                 if (mlen < (ulpoff + sizeof(csum))) {
2043                         panic("%s: mbuf %p pkt len (%u) proto %d invalid ULP "
2044                             "cksum offset (%u) cksum flags 0x%x\n", __func__,
2045                             m, mlen, ip->ip_p, ulpoff, m->m_pkthdr.csum_flags);
2046                         /* NOTREACHED */
2047                 }
2048
2049                 csum = inet_cksum(m, 0, offset, len);
2050
2051                 /* Update stats */
2052                 ip_out_cksum_stats(ip->ip_p, len);
2053
2054                 /* RFC1122 4.1.3.4 */
2055                 if (csum == 0 &&
2056                     (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_ZERO_INVERT))) {
2057                         csum = 0xffff;
2058                 }
2059
2060                 /* Insert the checksum in the ULP csum field */
                     /*
                      * Three ways to store the 16-bit result: m_copyback()
                      * when the field does not lie wholly inside the first
                      * mbuf, a direct 16-bit store when the IP header start
                      * passes IP_HDR_ALIGNED_P(), and a bytewise bcopy()
                      * otherwise.
                      */
2061                 offset += ulpoff;
2062                 if (offset + sizeof(csum) > m->m_len) {
2063                         m_copyback(m, offset, sizeof(csum), &csum);
2064                 } else if (IP_HDR_ALIGNED_P(mtod(m, char *) + hoff)) {
2065                         *(uint16_t *)(void *)(mtod(m, char *) + offset) = csum;
2066                 } else {
2067                         bcopy(&csum, (mtod(m, char *) + offset), sizeof(csum));
2068                 }
                     /* payload checksum is final: drop every flag tied to it */
2069                 m->m_pkthdr.csum_flags &= ~(CSUM_DELAY_DATA | CSUM_DATA_VALID |
2070                     CSUM_PARTIAL | CSUM_ZERO_INVERT);
2071         }
2072
2073         if (sw_csum & CSUM_DELAY_IP) {
2074                 /* IP header must be in the local buffer */
2075                 VERIFY(_hlen == sizeof(*ip));
                     /*
                      * Header has options: re-copy it in full so the
                      * checksum below covers all hlen bytes.
                      */
2076                 if (_hlen != hlen) {
2077                         VERIFY(hlen <= sizeof(buf));
2078                         m_copydata(m, hoff, hlen, (caddr_t)buf);
2079                         ip = (struct ip *)(void *)buf;
2080                         _hlen = hlen;
2081                 }
2082
2083                 /*
2084                  * Compute the IP header checksum as if the IP length
2085                  * is the length which we believe is "correct"; see
2086                  * how ip_len gets calculated above.  Note that this
2087                  * is done on the local copy and not on the real one.
2088                  */
2089                 ip->ip_len = htons(ip_len);
2090                 ip->ip_sum = 0;
2091                 csum = in_cksum_hdr_opt(ip);
2092
2093                 /* Update stats */
2094                 ipstat.ips_snd_swcsum++;
2095                 ipstat.ips_snd_swcsum_bytes += hlen;
2096
2097                 /*
2098                  * Insert only the checksum in the existing IP header
2099                  * csum field; all other fields are left unchanged.
2100                  */
2101                 offset = hoff + offsetof(struct ip, ip_sum);
2102                 if (offset + sizeof(csum) > m->m_len) {
2103                         m_copyback(m, offset, sizeof(csum), &csum);
2104                 } else if (IP_HDR_ALIGNED_P(mtod(m, char *) + hoff)) {
2105                         *(uint16_t *)(void *)(mtod(m, char *) + offset) = csum;
2106                 } else {
2107                         bcopy(&csum, (mtod(m, char *) + offset), sizeof(csum));
2108                 }
2109                 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP;
2110         }
2111
2112 done:
             /* sw_csum is 0 when nothing was requested or nothing was pending */
2113         return sw_csum;
2114 }
2115
2116 /*
2117  * Insert IP options into preformed packet.
2118  * Adjust IP destination as required for IP source routing,
2119  * as indicated by a non-zero in_addr at the start of the options.
2120  *
2121  * XXX This routine assumes that the packet has no options in place.
2122  */
2123 static struct mbuf *
2124 ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen)
2125 {
2126         struct ipoption *p = mtod(opt, struct ipoption *);
2127         struct mbuf *n;
2128         struct ip *ip = mtod(m, struct ip *);
2129         unsigned optlen;
2130
2131         optlen = opt->m_len - sizeof(p->ipopt_dst);
2132         if (optlen + (u_short)ip->ip_len > IP_MAXPACKET) {
2133                 return m;             /* XXX should fail */
2134         }
2135         if (p->ipopt_dst.s_addr) {
2136                 ip->ip_dst = p->ipopt_dst;
2137         }
2138         if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) {
2139                 MGETHDR(n, M_DONTWAIT, MT_HEADER);      /* MAC-OK */
2140                 if (n == NULL) {
2141                         return m;
2142                 }
2143                 n->m_pkthdr.rcvif = 0;
2144                 n->m_pkthdr.len = m->m_pkthdr.len + optlen;
2145                 m->m_len -= sizeof(struct ip);
2146                 m->m_data += sizeof(struct ip);
2147                 n->m_next = m;
2148                 m = n;
2149                 m->m_len = optlen + sizeof(struct ip);
2150                 m->m_data += max_linkhdr;
2151                 (void) memcpy(mtod(m, void *), ip, sizeof(struct ip));
2152         } else {
2153                 m->m_data -= optlen;
2154                 m->m_len += optlen;
2155                 m->m_pkthdr.len += optlen;
2156                 ovbcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
2157         }
2158         ip = mtod(m, struct ip *);
2159         bcopy(p->ipopt_list, ip + 1, optlen);
2160         *phlen = sizeof(struct ip) + optlen;
2161         ip->ip_vhl = IP_MAKE_VHL(IPVERSION, *phlen >> 2);
2162         ip->ip_len += optlen;
2163         return m;
2164 }
2165
2166 /*
2167  * Copy options from ip to jp,
2168  * omitting those not copied during fragmentation.
2169  */
2170 static int
2171 ip_optcopy(struct ip *ip, struct ip *jp)
2172 {
2173         u_char *cp, *dp;
2174         int opt, optlen, cnt;
2175
2176         cp = (u_char *)(ip + 1);
2177         dp = (u_char *)(jp + 1);
2178         cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof(struct ip);
2179         for (; cnt > 0; cnt -= optlen, cp += optlen) {
2180                 opt = cp[0];
2181                 if (opt == IPOPT_EOL) {
2182                         break;
2183                 }
2184                 if (opt == IPOPT_NOP) {
2185                         /* Preserve for IP mcast tunnel's LSRR alignment. */
2186                         *dp++ = IPOPT_NOP;
2187                         optlen = 1;
2188                         continue;
2189                 }
2190 #if DIAGNOSTIC
2191                 if (cnt < IPOPT_OLEN + sizeof(*cp)) {
2192                         panic("malformed IPv4 option passed to ip_optcopy");
2193                         /* NOTREACHED */
2194                 }
2195 #endif
2196                 optlen = cp[IPOPT_OLEN];
2197 #if DIAGNOSTIC
2198                 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) {
2199                         panic("malformed IPv4 option passed to ip_optcopy");
2200                         /* NOTREACHED */
2201                 }
2202 #endif
2203                 /* bogus lengths should have been caught by ip_dooptions */
2204                 if (optlen > cnt) {
2205                         optlen = cnt;
2206                 }
2207                 if (IPOPT_COPIED(opt)) {
2208                         bcopy(cp, dp, optlen);
2209                         dp += optlen;
2210                 }
2211         }
2212         for (optlen = (int)(dp - (u_char *)(jp + 1)); optlen & 0x3; optlen++) {
2213                 *dp++ = IPOPT_EOL;
2214         }
2215         return optlen;
2216 }
2217
2218 /*
2219  * IP socket option processing.
2220  */
2221 int
2222 ip_ctloutput(struct socket *so, struct sockopt *sopt)
2223 {
2224         struct  inpcb *inp = sotoinpcb(so);
2225         int     error, optval;
2226         lck_mtx_t *mutex_held = NULL;
2227
2228         error = optval = 0;
2229         if (sopt->sopt_level != IPPROTO_IP) {
2230                 return EINVAL;
2231         }
2232
2233         switch (sopt->sopt_dir) {
2234         case SOPT_SET:
2235                 mutex_held = socket_getlock(so, PR_F_WILLUNLOCK);
2236                 /*
2237                  *  Wait if we are in the middle of ip_output
2238                  *  as we unlocked the socket there and don't
2239                  *  want to overwrite the IP options
2240                  */
2241                 if (inp->inp_sndinprog_cnt > 0) {
2242                         inp->inp_sndingprog_waiters++;
2243
2244                         while (inp->inp_sndinprog_cnt > 0) {
2245                                 msleep(&inp->inp_sndinprog_cnt, mutex_held,
2246                                     PSOCK | PCATCH, "inp_sndinprog_cnt", NULL);
2247                         }
2248                         inp->inp_sndingprog_waiters--;
2249                 }
2250                 switch (sopt->sopt_name) {
2251 #ifdef notyet
2252                 case IP_RETOPTS:
2253 #endif
2254                 case IP_OPTIONS: {
2255                         struct mbuf *m;
2256
2257                         if (sopt->sopt_valsize > MLEN) {
2258                                 error = EMSGSIZE;
2259                                 break;
2260                         }
2261                         MGET(m, sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT,
2262                             MT_HEADER);
2263                         if (m == NULL) {
2264                                 error = ENOBUFS;
2265                                 break;
2266                         }
2267                         m->m_len = (int32_t)sopt->sopt_valsize;
2268                         error = sooptcopyin(sopt, mtod(m, char *),
2269                             m->m_len, m->m_len);
2270                         if (error) {
2271                                 m_freem(m);
2272                                 break;
2273                         }
2274
2275                         return ip_pcbopts(sopt->sopt_name,
2276                                    &inp->inp_options, m);
2277                 }
2278
2279                 case IP_TOS:
2280                 case IP_TTL:
2281                 case IP_RECVOPTS:
2282                 case IP_RECVRETOPTS:
2283                 case IP_RECVDSTADDR:
2284                 case IP_RECVIF:
2285                 case IP_RECVTTL:
2286                 case IP_RECVPKTINFO:
2287                 case IP_RECVTOS:
2288                 case IP_DONTFRAG:
2289                         error = sooptcopyin(sopt, &optval, sizeof(optval),
2290                             sizeof(optval));
2291                         if (error) {
2292                                 break;
2293                         }
2294
2295                         switch (sopt->sopt_name) {
2296                         case IP_TOS:
2297                                 if (optval > UINT8_MAX) {
2298                                         error = EINVAL;
2299                                         break;
2300                                 }
2301                                 inp->inp_ip_tos = (uint8_t)optval;
2302                                 break;
2303
2304                         case IP_TTL:
2305                                 if (optval > UINT8_MAX) {
2306                                         error = EINVAL;
2307                                         break;
2308                                 }
2309                                 inp->inp_ip_ttl = (uint8_t)optval;
2310                                 break;
2311 #define OPTSET(bit) do {                                                \
2312         if (optval) {                                                   \
2313             inp->inp_flags |= bit;                                      \
2314         } else {                                                        \
2315             inp->inp_flags &= ~bit;                                     \
2316         }                                                               \
2317 } while (0)
2318
2319 #define OPTSET2(bit) do {                                               \
2320         if (optval) {                                                   \
2321             inp->inp_flags2 |= bit;                                     \
2322         } else {                                                        \
2323             inp->inp_flags2 &= ~bit;                                    \
2324         }                                                               \
2325 } while (0)
2326
2327                         case IP_RECVOPTS:
2328                                 OPTSET(INP_RECVOPTS);
2329                                 break;
2330
2331                         case IP_RECVRETOPTS:
2332                                 OPTSET(INP_RECVRETOPTS);
2333                                 break;
2334
2335                         case IP_RECVDSTADDR:
2336                                 OPTSET(INP_RECVDSTADDR);
2337                                 break;
2338
2339                         case IP_RECVIF:
2340                                 OPTSET(INP_RECVIF);
2341                                 break;
2342
2343                         case IP_RECVTTL:
2344                                 OPTSET(INP_RECVTTL);
2345                                 break;
2346
2347                         case IP_RECVPKTINFO:
2348                                 OPTSET(INP_PKTINFO);
2349                                 break;
2350
2351                         case IP_RECVTOS:
2352                                 OPTSET(INP_RECVTOS);
2353                                 break;
2354
2355                         case IP_DONTFRAG:
2356                                 /* This option is settable only for IPv4 */
2357                                 if (!(inp->inp_vflag & INP_IPV4)) {
2358                                         error = EINVAL;
2359                                         break;
2360                                 }
2361                                 OPTSET2(INP2_DONTFRAG);
2362                                 break;
2363 #undef OPTSET
2364 #undef OPTSET2
2365                         }
2366                         break;
2367                 /*
2368                  * Multicast socket options are processed by the in_mcast
2369                  * module.
2370                  */
2371                 case IP_MULTICAST_IF:
2372                 case IP_MULTICAST_IFINDEX:
2373                 case IP_MULTICAST_VIF:
2374                 case IP_MULTICAST_TTL:
2375                 case IP_MULTICAST_LOOP:
2376                 case IP_ADD_MEMBERSHIP:
2377                 case IP_DROP_MEMBERSHIP:
2378                 case IP_ADD_SOURCE_MEMBERSHIP:
2379                 case IP_DROP_SOURCE_MEMBERSHIP:
2380                 case IP_BLOCK_SOURCE:
2381                 case IP_UNBLOCK_SOURCE:
2382                 case IP_MSFILTER:
2383                 case MCAST_JOIN_GROUP:
2384                 case MCAST_LEAVE_GROUP:
2385                 case MCAST_JOIN_SOURCE_GROUP:
2386                 case MCAST_LEAVE_SOURCE_GROUP:
2387                 case MCAST_BLOCK_SOURCE:
2388                 case MCAST_UNBLOCK_SOURCE:
2389                         error = inp_setmoptions(inp, sopt);
2390                         break;
2391
2392                 case IP_PORTRANGE:
2393                         error = sooptcopyin(sopt, &optval, sizeof(optval),
2394                             sizeof(optval));
2395                         if (error) {
2396                                 break;
2397                         }
2398
2399                         switch (optval) {
2400                         case IP_PORTRANGE_DEFAULT:
2401                                 inp->inp_flags &= ~(INP_LOWPORT);
2402                                 inp->inp_flags &= ~(INP_HIGHPORT);
2403                                 break;
2404
2405                         case IP_PORTRANGE_HIGH:
2406                                 inp->inp_flags &= ~(INP_LOWPORT);
2407                                 inp->inp_flags |= INP_HIGHPORT;
2408                                 break;
2409
2410                         case IP_PORTRANGE_LOW:
2411                                 inp->inp_flags &= ~(INP_HIGHPORT);
2412                                 inp->inp_flags |= INP_LOWPORT;
2413                                 break;
2414
2415                         default:
2416                                 error = EINVAL;
2417                                 break;
2418                         }
2419                         break;
2420
2421 #if IPSEC
2422                 case IP_IPSEC_POLICY: {
2423                         caddr_t req = NULL;
2424                         size_t len = 0;
2425                         int priv;
2426                         struct mbuf *m;
2427                         int optname;
2428
2429                         if ((error = soopt_getm(sopt, &m)) != 0) { /* XXX */
2430                                 break;
2431                         }
2432                         if ((error = soopt_mcopyin(sopt, m)) != 0) { /* XXX */
2433                                 break;
2434                         }
2435                         priv = (proc_suser(sopt->sopt_p) == 0);
2436                         if (m) {
2437                                 req = mtod(m, caddr_t);
2438                                 len = m->m_len;
2439                         }
2440                         optname = sopt->sopt_name;
2441                         error = ipsec4_set_policy(inp, optname, req, len, priv);
2442                         m_freem(m);
2443                         break;
2444                 }
2445 #endif /* IPSEC */
2446
2447 #if TRAFFIC_MGT
2448                 case IP_TRAFFIC_MGT_BACKGROUND: {
2449                         unsigned background = 0;
2450
2451                         error = sooptcopyin(sopt, &background,
2452                             sizeof(background), sizeof(background));
2453                         if (error) {
2454                                 break;
2455                         }
2456
2457                         if (background) {
2458                                 socket_set_traffic_mgt_flags_locked(so,
2459                                     TRAFFIC_MGT_SO_BACKGROUND);
2460                         } else {
2461                                 socket_clear_traffic_mgt_flags_locked(so,
2462                                     TRAFFIC_MGT_SO_BACKGROUND);
2463                         }
2464
2465                         break;
2466                 }
2467 #endif /* TRAFFIC_MGT */
2468
2469                 /*
2470                  * On a multihomed system, scoped routing can be used to
2471                  * restrict the source interface used for sending packets.
2472                  * The socket option IP_BOUND_IF binds a particular AF_INET
2473                  * socket to an interface such that data sent on the socket
2474                  * is restricted to that interface.  This is unlike the
2475                  * SO_DONTROUTE option where the routing table is bypassed;
2476                  * therefore it allows for a greater flexibility and control
2477                  * over the system behavior, and does not place any restriction
2478                  * on the destination address type (e.g.  unicast, multicast,
2479                  * or broadcast if applicable) or whether or not the host is
2480                  * directly reachable.  Note that in the multicast transmit
2481                  * case, IP_MULTICAST_{IF,IFINDEX} takes precedence over
2482                  * IP_BOUND_IF, since the former practically bypasses the
2483                  * routing table; in this case, IP_BOUND_IF sets the default
2484                  * interface used for sending multicast packets in the absence
2485                  * of an explicit multicast transmit interface.
2486                  */
2487                 case IP_BOUND_IF:
2488                         /* This option is settable only for IPv4 */
2489                         if (!(inp->inp_vflag & INP_IPV4)) {
2490                                 error = EINVAL;
2491                                 break;
2492                         }
2493
2494                         error = sooptcopyin(sopt, &optval, sizeof(optval),
2495                             sizeof(optval));
2496
2497                         if (error) {
2498                                 break;
2499                         }
2500
2501                         error = inp_bindif(inp, optval, NULL);
2502                         break;
2503
2504                 case IP_NO_IFT_CELLULAR:
2505                         /* This option is settable only for IPv4 */
2506                         if (!(inp->inp_vflag & INP_IPV4)) {
2507                                 error = EINVAL;
2508                                 break;
2509                         }
2510
2511                         error = sooptcopyin(sopt, &optval, sizeof(optval),
2512                             sizeof(optval));
2513
2514                         if (error) {
2515                                 break;
2516                         }
2517
2518                         /* once set, it cannot be unset */
2519                         if (!optval && INP_NO_CELLULAR(inp)) {
2520                                 error = EINVAL;
2521                                 break;
2522                         }
2523
2524                         error = so_set_restrictions(so,
2525                             SO_RESTRICT_DENY_CELLULAR);
2526                         break;
2527
2528                 case IP_OUT_IF:
2529                         /* This option is not settable */
2530                         error = EINVAL;
2531                         break;
2532
2533                 default:
2534                         error = ENOPROTOOPT;
2535                         break;
2536                 }
2537                 break;
2538
2539         case SOPT_GET:
2540                 switch (sopt->sopt_name) {
2541                 case IP_OPTIONS:
2542                 case IP_RETOPTS:
2543                         if (inp->inp_options) {
2544                                 error = sooptcopyout(sopt,
2545                                     mtod(inp->inp_options, char *),
2546                                     inp->inp_options->m_len);
2547                         } else {
2548                                 sopt->sopt_valsize = 0;
2549                         }
2550                         break;
2551
2552                 case IP_TOS:
2553                 case IP_TTL:
2554                 case IP_RECVOPTS:
2555                 case IP_RECVRETOPTS:
2556                 case IP_RECVDSTADDR:
2557                 case IP_RECVIF:
2558                 case IP_RECVTTL:
2559                 case IP_PORTRANGE:
2560                 case IP_RECVPKTINFO:
2561                 case IP_RECVTOS:
2562                 case IP_DONTFRAG:
2563                         switch (sopt->sopt_name) {
2564                         case IP_TOS:
2565                                 optval = inp->inp_ip_tos;
2566                                 break;
2567
2568                         case IP_TTL:
2569                                 optval = inp->inp_ip_ttl;
2570                                 break;
2571
2572 #define OPTBIT(bit)     (inp->inp_flags & bit ? 1 : 0)
2573 #define OPTBIT2(bit)    (inp->inp_flags2 & bit ? 1 : 0)
2574                         case IP_RECVOPTS:
2575                                 optval = OPTBIT(INP_RECVOPTS);
2576                                 break;
2577
2578                         case IP_RECVRETOPTS:
2579                                 optval = OPTBIT(INP_RECVRETOPTS);
2580                                 break;
2581
2582                         case IP_RECVDSTADDR:
2583                                 optval = OPTBIT(INP_RECVDSTADDR);
2584                                 break;
2585
2586                         case IP_RECVIF:
2587                                 optval = OPTBIT(INP_RECVIF);
2588                                 break;
2589
2590                         case IP_RECVTTL:
2591                                 optval = OPTBIT(INP_RECVTTL);
2592                                 break;
2593
2594                         case IP_PORTRANGE:
2595                                 if (inp->inp_flags & INP_HIGHPORT) {
2596                                         optval = IP_PORTRANGE_HIGH;
2597                                 } else if (inp->inp_flags & INP_LOWPORT) {
2598                                         optval = IP_PORTRANGE_LOW;
2599                                 } else {
2600                                         optval = 0;
2601                                 }
2602                                 break;
2603
2604                         case IP_RECVPKTINFO:
2605                                 optval = OPTBIT(INP_PKTINFO);
2606                                 break;
2607
2608                         case IP_RECVTOS:
2609                                 optval = OPTBIT(INP_RECVTOS);
2610                                 break;
2611                         case IP_DONTFRAG:
2612                                 optval = OPTBIT2(INP2_DONTFRAG);
2613                                 break;
2614                         }
2615                         error = sooptcopyout(sopt, &optval, sizeof(optval));
2616                         break;
2617
2618                 case IP_MULTICAST_IF:
2619                 case IP_MULTICAST_IFINDEX:
2620                 case IP_MULTICAST_VIF:
2621                 case IP_MULTICAST_TTL:
2622                 case IP_MULTICAST_LOOP:
2623                 case IP_MSFILTER:
2624                         error = inp_getmoptions(inp, sopt);
2625                         break;
2626
2627 #if IPSEC
2628                 case IP_IPSEC_POLICY: {
2629                         error = 0; /* This option is no longer supported */
2630                         break;
2631                 }
2632 #endif /* IPSEC */
2633
2634 #if TRAFFIC_MGT
2635                 case IP_TRAFFIC_MGT_BACKGROUND: {
2636                         unsigned background = (so->so_flags1 &
2637                             SOF1_TRAFFIC_MGT_SO_BACKGROUND) ? 1 : 0;
2638                         return sooptcopyout(sopt, &background,
2639                                    sizeof(background));
2640                 }
2641 #endif /* TRAFFIC_MGT */
2642
2643                 case IP_BOUND_IF:
2644                         if (inp->inp_flags & INP_BOUND_IF) {
2645                                 optval = inp->inp_boundifp->if_index;
2646                         }
2647                         error = sooptcopyout(sopt, &optval, sizeof(optval));
2648                         break;
2649
2650                 case IP_NO_IFT_CELLULAR:
2651                         optval = INP_NO_CELLULAR(inp) ? 1 : 0;
2652                         error = sooptcopyout(sopt, &optval, sizeof(optval));
2653                         break;
2654
2655                 case IP_OUT_IF:
2656                         optval = (inp->inp_last_outifp != NULL) ?
2657                             inp->inp_last_outifp->if_index : 0;
2658                         error = sooptcopyout(sopt, &optval, sizeof(optval));
2659                         break;
2660
2661                 default:
2662                         error = ENOPROTOOPT;
2663                         break;
2664                 }
2665                 break;
2666         }
2667         return error;
2668 }
2669
2670 /*
2671  * Set up IP options in pcb for insertion in output packets.
2672  * Store in mbuf with pointer in pcbopt, adding pseudo-option
2673  * with destination address if source routed.
2674  */
2675 static int
2676 ip_pcbopts(int optname, struct mbuf **pcbopt, struct mbuf *m)
2677 {
2678 #pragma unused(optname)
2679         int cnt, optlen;
2680         u_char *cp;
2681         u_char opt;
2682
2683         /* turn off any old options */
2684         if (*pcbopt) {
2685                 (void) m_free(*pcbopt);
2686         }
2687         *pcbopt = 0;
2688         if (m == (struct mbuf *)0 || m->m_len == 0) {
2689                 /*
2690                  * Only turning off any previous options.
2691                  */
2692                 if (m) {
2693                         (void) m_free(m);
2694                 }
2695                 return 0;
2696         }
2697
2698         if (m->m_len % sizeof(int32_t)) {
2699                 goto bad;
2700         }
2701
2702         /*
2703          * IP first-hop destination address will be stored before
2704          * actual options; move other options back
2705          * and clear it when none present.
2706          */
2707         if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN]) {
2708                 goto bad;
2709         }
2710         cnt = m->m_len;
2711         m->m_len += sizeof(struct in_addr);
2712         cp = mtod(m, u_char *) + sizeof(struct in_addr);
2713         ovbcopy(mtod(m, caddr_t), (caddr_t)cp, (unsigned)cnt);
2714         bzero(mtod(m, caddr_t), sizeof(struct in_addr));
2715
2716         for (; cnt > 0; cnt -= optlen, cp += optlen) {
2717                 opt = cp[IPOPT_OPTVAL];
2718                 if (opt == IPOPT_EOL) {
2719                         break;
2720                 }
2721                 if (opt == IPOPT_NOP) {
2722                         optlen = 1;
2723                 } else {
2724                         if (cnt < IPOPT_OLEN + sizeof(*cp)) {
2725                                 goto bad;
2726                         }
2727                         optlen = cp[IPOPT_OLEN];
2728                         if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) {
2729                                 goto bad;
2730                         }
2731                 }
2732                 switch (opt) {
2733                 default:
2734                         break;
2735
2736                 case IPOPT_LSRR:
2737                 case IPOPT_SSRR:
2738                         /*
2739                          * user process specifies route as:
2740                          *      ->A->B->C->D
2741                          * D must be our final destination (but we can't
2742                          * check that since we may not have connected yet).
2743                          * A is first hop destination, which doesn't appear in
2744                          * actual IP option, but is stored before the options.
2745                          */
2746                         if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr)) {
2747                                 goto bad;
2748                         }
2749                         if (optlen > UINT8_MAX) {
2750                                 goto bad;
2751                         }
2752                         m->m_len -= sizeof(struct in_addr);
2753                         cnt -= sizeof(struct in_addr);
2754                         optlen -= sizeof(struct in_addr);
2755                         cp[IPOPT_OLEN] = (uint8_t)optlen;
2756                         /*
2757                          * Move first hop before start of options.
2758                          */
2759                         bcopy((caddr_t)&cp[IPOPT_OFFSET + 1], mtod(m, caddr_t),
2760                             sizeof(struct in_addr));
2761                         /*
2762                          * Then copy rest of options back
2763                          * to close up the deleted entry.
2764                          */
2765                         ovbcopy((caddr_t)(&cp[IPOPT_OFFSET + 1] +
2766                             sizeof(struct in_addr)),
2767                             (caddr_t)&cp[IPOPT_OFFSET + 1],
2768                             (unsigned)cnt - (IPOPT_MINOFF - 1));
2769                         break;
2770                 }
2771         }
2772         if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr)) {
2773                 goto bad;
2774         }
2775         *pcbopt = m;
2776         return 0;
2777
2778 bad:
2779         (void) m_free(m);
2780         return EINVAL;
2781 }
2782
2783 void
2784 ip_moptions_init(void)
2785 {
2786         PE_parse_boot_argn("ifa_debug", &imo_debug, sizeof(imo_debug));
2787
2788         vm_size_t imo_size = (imo_debug == 0) ? sizeof(struct ip_moptions) :
2789             sizeof(struct ip_moptions_dbg);
2790
2791         imo_zone = zone_create(IMO_ZONE_NAME, imo_size, ZC_ZFREE_CLEARMEM);
2792 }
2793
2794 void
2795 imo_addref(struct ip_moptions *imo, int locked)
2796 {
2797         if (!locked) {
2798                 IMO_LOCK(imo);
2799         } else {
2800                 IMO_LOCK_ASSERT_HELD(imo);
2801         }
2802
2803         if (++imo->imo_refcnt == 0) {
2804                 panic("%s: imo %p wraparound refcnt\n", __func__, imo);
2805                 /* NOTREACHED */
2806         } else if (imo->imo_trace != NULL) {
2807                 (*imo->imo_trace)(imo, TRUE);
2808         }
2809
2810         if (!locked) {
2811                 IMO_UNLOCK(imo);
2812         }
2813 }
2814
2815 void
2816 imo_remref(struct ip_moptions *imo)
2817 {
2818         int i;
2819
2820         IMO_LOCK(imo);
2821         if (imo->imo_refcnt == 0) {
2822                 panic("%s: imo %p negative refcnt", __func__, imo);
2823                 /* NOTREACHED */
2824         } else if (imo->imo_trace != NULL) {
2825                 (*imo->imo_trace)(imo, FALSE);
2826         }
2827
2828         --imo->imo_refcnt;
2829         if (imo->imo_refcnt > 0) {
2830                 IMO_UNLOCK(imo);
2831                 return;
2832         }
2833
2834         for (i = 0; i < imo->imo_num_memberships; ++i) {
2835                 struct in_mfilter *imf;
2836
2837                 imf = imo->imo_mfilters ? &imo->imo_mfilters[i] : NULL;
2838                 if (imf != NULL) {
2839                         imf_leave(imf);
2840                 }
2841
2842                 (void) in_leavegroup(imo->imo_membership[i], imf);
2843
2844                 if (imf != NULL) {
2845                         imf_purge(imf);
2846                 }
2847
2848                 INM_REMREF(imo->imo_membership[i]);
2849                 imo->imo_membership[i] = NULL;
2850         }
2851         imo->imo_num_memberships = 0;
2852         if (imo->imo_mfilters != NULL) {
2853                 FREE(imo->imo_mfilters, M_INMFILTER);
2854                 imo->imo_mfilters = NULL;
2855         }
2856         if (imo->imo_membership != NULL) {
2857                 FREE(imo->imo_membership, M_IPMOPTS);
2858                 imo->imo_membership = NULL;
2859         }
2860         IMO_UNLOCK(imo);
2861
2862         lck_mtx_destroy(&imo->imo_lock, ifa_mtx_grp);
2863
2864         if (!(imo->imo_debug & IFD_ALLOC)) {
2865                 panic("%s: imo %p cannot be freed", __func__, imo);
2866                 /* NOTREACHED */
2867         }
2868         zfree(imo_zone, imo);
2869 }
2870
2871 static void
2872 imo_trace(struct ip_moptions *imo, int refhold)
2873 {
2874         struct ip_moptions_dbg *imo_dbg = (struct ip_moptions_dbg *)imo;
2875         ctrace_t *tr;
2876         u_int32_t idx;
2877         u_int16_t *cnt;
2878
2879         if (!(imo->imo_debug & IFD_DEBUG)) {
2880                 panic("%s: imo %p has no debug structure", __func__, imo);
2881                 /* NOTREACHED */
2882         }
2883         if (refhold) {
2884                 cnt = &imo_dbg->imo_refhold_cnt;
2885                 tr = imo_dbg->imo_refhold;
2886         } else {
2887                 cnt = &imo_dbg->imo_refrele_cnt;
2888                 tr = imo_dbg->imo_refrele;
2889         }
2890
2891         idx = atomic_add_16_ov(cnt, 1) % IMO_TRACE_HIST_SIZE;
2892         ctrace_record(&tr[idx]);
2893 }
2894
2895 struct ip_moptions *
2896 ip_allocmoptions(zalloc_flags_t how)
2897 {
2898         struct ip_moptions *imo;
2899
2900         imo = zalloc_flags(imo_zone, how | Z_ZERO);
2901         if (imo != NULL) {
2902                 lck_mtx_init(&imo->imo_lock, ifa_mtx_grp, ifa_mtx_attr);
2903                 imo->imo_debug |= IFD_ALLOC;
2904                 if (imo_debug != 0) {
2905                         imo->imo_debug |= IFD_DEBUG;
2906                         imo->imo_trace = imo_trace;
2907                 }
2908                 IMO_ADDREF(imo);
2909         }
2910
2911         return imo;
2912 }
2913
2914 /*
2915  * Routine called from ip_output() to loop back a copy of an IP multicast
2916  * packet to the input queue of a specified interface.  Note that this
2917  * calls the output routine of the loopback "driver", but with an interface
2918  * pointer that might NOT be a loopback interface -- evil, but easier than
2919  * replicating that code here.
2920  */
2921 static void
2922 ip_mloopback(struct ifnet *srcifp, struct ifnet *origifp, struct mbuf *m,
2923     struct sockaddr_in *dst, int hlen)
2924 {
2925         struct mbuf *copym;
2926         struct ip *ip;
2927
2928         if (lo_ifp == NULL) {
2929                 return;
2930         }
2931
2932         /*
2933          * Copy the packet header as it's needed for the checksum
2934          * Make sure to deep-copy IP header portion in case the data
2935          * is in an mbuf cluster, so that we can safely override the IP
2936          * header portion later.
2937          */
2938         copym = m_copym_mode(m, 0, M_COPYALL, M_DONTWAIT, M_COPYM_COPY_HDR);
2939         if (copym != NULL && ((copym->m_flags & M_EXT) || copym->m_len < hlen)) {
2940                 copym = m_pullup(copym, hlen);
2941         }
2942
2943         if (copym == NULL) {
2944                 return;
2945         }
2946
2947         /*
2948          * We don't bother to fragment if the IP length is greater
2949          * than the interface's MTU.  Can this possibly matter?
2950          */
2951         ip = mtod(copym, struct ip *);
2952 #if BYTE_ORDER != BIG_ENDIAN
2953         HTONS(ip->ip_len);
2954         HTONS(ip->ip_off);
2955 #endif
2956         ip->ip_sum = 0;
2957         ip->ip_sum = ip_cksum_hdr_out(copym, hlen);
2958
2959         /*
2960          * Mark checksum as valid unless receive checksum offload is
2961          * disabled; if so, compute checksum in software.  If the
2962          * interface itself is lo0, this will be overridden by if_loop.
2963          */
2964         if (hwcksum_rx) {
2965                 copym->m_pkthdr.csum_flags &= ~(CSUM_PARTIAL | CSUM_ZERO_INVERT);
2966                 copym->m_pkthdr.csum_flags |=
2967                     CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
2968                 copym->m_pkthdr.csum_data = 0xffff;
2969         } else if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
2970 #if BYTE_ORDER != BIG_ENDIAN
2971                 NTOHS(ip->ip_len);
2972 #endif
2973                 in_delayed_cksum(copym);
2974 #if BYTE_ORDER != BIG_ENDIAN
2975                 HTONS(ip->ip_len);
2976 #endif
2977         }
2978
2979         /*
2980          * Stuff the 'real' ifp into the pkthdr, to be used in matching
2981          * in ip_input(); we need the loopback ifp/dl_tag passed as args
2982          * to make the loopback driver compliant with the data link
2983          * requirements.
2984          */
2985         copym->m_pkthdr.rcvif = origifp;
2986
2987         /*
2988          * Also record the source interface (which owns the source address).
2989          * This is basically a stripped down version of ifa_foraddr().
2990          */
2991         if (srcifp == NULL) {
2992                 struct in_ifaddr *ia;
2993
2994                 lck_rw_lock_shared(in_ifaddr_rwlock);
2995                 TAILQ_FOREACH(ia, INADDR_HASH(ip->ip_src.s_addr), ia_hash) {
2996                         IFA_LOCK_SPIN(&ia->ia_ifa);
2997                         if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_src.s_addr) {
2998                                 srcifp = ia->ia_ifp;
2999                                 IFA_UNLOCK(&ia->ia_ifa);
3000                                 break;
3001                         }
3002                         IFA_UNLOCK(&ia->ia_ifa);
3003                 }
3004                 lck_rw_done(in_ifaddr_rwlock);
3005         }
3006         if (srcifp != NULL) {
3007                 ip_setsrcifaddr_info(copym, srcifp->if_index, NULL);
3008         }
3009         ip_setdstifaddr_info(copym, origifp->if_index, NULL);
3010
3011         dlil_output(lo_ifp, PF_INET, copym, NULL, SA(dst), 0, NULL);
3012 }
3013
3014 /*
3015  * Given a source IP address (and route, if available), determine the best
3016  * interface to send the packet from.  Checking for (and updating) the
3017  * ROF_SRCIF_SELECTED flag in the pcb-supplied route placeholder is done
3018  * without any locks based on the assumption that ip_output() is single-
3019  * threaded per-pcb, i.e. for any given pcb there can only be one thread
3020  * performing output at the IP layer.
3021  *
3022  * This routine is analogous to in6_selectroute() for IPv6.
3023  */
3024 static struct ifaddr *
3025 in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope)
3026 {
3027         struct ifaddr *ifa = NULL;
3028         struct in_addr src = ip->ip_src;
3029         struct in_addr dst = ip->ip_dst;
3030         struct ifnet *rt_ifp;
3031         char s_src[MAX_IPv4_STR_LEN], s_dst[MAX_IPv4_STR_LEN];
3032
3033         VERIFY(src.s_addr != INADDR_ANY);
3034
3035         if (ip_select_srcif_debug) {
3036                 (void) inet_ntop(AF_INET, &src.s_addr, s_src, sizeof(s_src));
3037                 (void) inet_ntop(AF_INET, &dst.s_addr, s_dst, sizeof(s_dst));
3038         }
3039
3040         if (ro->ro_rt != NULL) {
3041                 RT_LOCK(ro->ro_rt);
3042         }
3043
3044         rt_ifp = (ro->ro_rt != NULL) ? ro->ro_rt->rt_ifp : NULL;
3045
3046         /*
3047          * Given the source IP address, find a suitable source interface
3048          * to use for transmission; if the caller has specified a scope,
3049          * optimize the search by looking at the addresses only for that
3050          * interface.  This is still suboptimal, however, as we need to
3051          * traverse the per-interface list.
3052          */
3053         if (ifscope != IFSCOPE_NONE || ro->ro_rt != NULL) {
3054                 unsigned int scope = ifscope;
3055
3056                 /*
3057                  * If no scope is specified and the route is stale (pointing
3058                  * to a defunct interface) use the current primary interface;
3059                  * this happens when switching between interfaces configured
3060                  * with the same IP address.  Otherwise pick up the scope
3061                  * information from the route; the ULP may have looked up a
3062                  * correct route and we just need to verify it here and mark
3063                  * it with the ROF_SRCIF_SELECTED flag below.
3064                  */
3065                 if (scope == IFSCOPE_NONE) {
3066                         scope = rt_ifp->if_index;
3067                         if (scope != get_primary_ifscope(AF_INET) &&
3068                             ROUTE_UNUSABLE(ro)) {
3069                                 scope = get_primary_ifscope(AF_INET);
3070                         }
3071                 }
3072
3073                 ifa = (struct ifaddr *)ifa_foraddr_scoped(src.s_addr, scope);
3074
3075                 if (ifa == NULL && ip->ip_p != IPPROTO_UDP &&
3076                     ip->ip_p != IPPROTO_TCP && ipforwarding) {
3077                         /*
3078                          * If forwarding is enabled, and if the packet isn't
3079                          * TCP or UDP, check if the source address belongs
3080                          * to one of our own interfaces; if so, demote the
3081                          * interface scope and do a route lookup right below.
3082                          */
3083                         ifa = (struct ifaddr *)ifa_foraddr(src.s_addr);
3084                         if (ifa != NULL) {
3085                                 IFA_REMREF(ifa);
3086                                 ifa = NULL;
3087                                 ifscope = IFSCOPE_NONE;
3088                         }
3089                 }
3090
3091                 if (ip_select_srcif_debug && ifa != NULL) {
3092                         if (ro->ro_rt != NULL) {
3093                                 printf("%s->%s ifscope %d->%d ifa_if %s "
3094                                     "ro_if %s\n", s_src, s_dst, ifscope,
3095                                     scope, if_name(ifa->ifa_ifp),
3096                                     if_name(rt_ifp));
3097                         } else {
3098                                 printf("%s->%s ifscope %d->%d ifa_if %s\n",
3099                                     s_src, s_dst, ifscope, scope,
3100                                     if_name(ifa->ifa_ifp));
3101                         }
3102                 }
3103         }
3104
3105         /*
3106          * Slow path; search for an interface having the corresponding source
3107          * IP address if the scope was not specified by the caller, and:
3108          *
3109          *   1) There currently isn't any route, or,
3110          *   2) The interface used by the route does not own that source
3111          *      IP address; in this case, the route will get blown away
3112          *      and we'll do a more specific scoped search using the newly
3113          *      found interface.
3114          */
3115         if (ifa == NULL && ifscope == IFSCOPE_NONE) {
3116                 ifa = (struct ifaddr *)ifa_foraddr(src.s_addr);
3117
3118                 /*
3119                  * If we have the IP address, but not the route, we don't
3120                  * really know whether or not it belongs to the correct
3121                  * interface (it could be shared across multiple interfaces.)
3122                  * The only way to find out is to do a route lookup.
3123                  */
3124                 if (ifa != NULL && ro->ro_rt == NULL) {
3125                         struct rtentry *rt;
3126                         struct sockaddr_in sin;
3127                         struct ifaddr *oifa = NULL;
3128
3129                         bzero(&sin, sizeof(sin));
3130                         sin.sin_family = AF_INET;
3131                         sin.sin_len = sizeof(sin);
3132                         sin.sin_addr = dst;
3133
3134                         lck_mtx_lock(rnh_lock);
3135                         if ((rt = rt_lookup(TRUE, SA(&sin), NULL,
3136                             rt_tables[AF_INET], IFSCOPE_NONE)) != NULL) {
3137                                 RT_LOCK(rt);
3138                                 /*
3139                                  * If the route uses a different interface,
3140                                  * use that one instead.  The IP address of
3141                                  * the ifaddr that we pick up here is not
3142                                  * relevant.
3143                                  */
3144                                 if (ifa->ifa_ifp != rt->rt_ifp) {
3145                                         oifa = ifa;
3146                                         ifa = rt->rt_ifa;
3147                                         IFA_ADDREF(ifa);
3148                                         RT_UNLOCK(rt);
3149                                 } else {
3150                                         RT_UNLOCK(rt);
3151                                 }
3152                                 rtfree_locked(rt);
3153                         }
3154                         lck_mtx_unlock(rnh_lock);
3155
3156                         if (oifa != NULL) {
3157                                 struct ifaddr *iifa;
3158
3159                                 /*
3160                                  * See if the interface pointed to by the
3161                                  * route is configured with the source IP
3162                                  * address of the packet.
3163                                  */
3164                                 iifa = (struct ifaddr *)ifa_foraddr_scoped(
3165                                         src.s_addr, ifa->ifa_ifp->if_index);
3166
3167                                 if (iifa != NULL) {
3168                                         /*
3169                                          * Found it; drop the original one
3170                                          * as well as the route interface
3171                                          * address, and use this instead.
3172                                          */
3173                                         IFA_REMREF(oifa);
3174                                         IFA_REMREF(ifa);
3175                                         ifa = iifa;
3176                                 } else if (!ipforwarding ||
3177                                     (rt->rt_flags & RTF_GATEWAY)) {
3178                                         /*
3179                                          * This interface doesn't have that
3180                                          * source IP address; drop the route
3181                                          * interface address and just use the
3182                                          * original one, and let the caller
3183                                          * do a scoped route lookup.
3184                                          */
3185                                         IFA_REMREF(ifa);
3186                                         ifa = oifa;
3187                                 } else {
3188                                         /*
3189                                          * Forwarding is enabled and the source
3190                                          * address belongs to one of our own
3191                                          * interfaces which isn't the outgoing
3192                                          * interface, and we have a route, and
3193                                          * the destination is on a network that
3194                                          * is directly attached (onlink); drop
3195                                          * the original one and use the route
3196                                          * interface address instead.
3197                                          */
3198                                         IFA_REMREF(oifa);
3199                                 }
3200                         }
3201                 } else if (ifa != NULL && ro->ro_rt != NULL &&
3202                     !(ro->ro_rt->rt_flags & RTF_GATEWAY) &&
3203                     ifa->ifa_ifp != ro->ro_rt->rt_ifp && ipforwarding) {
3204                         /*
3205                          * Forwarding is enabled and the source address belongs
3206                          * to one of our own interfaces which isn't the same
3207                          * as the interface used by the known route; drop the
3208                          * original one and use the route interface address.
3209                          */
3210                         IFA_REMREF(ifa);
3211                         ifa = ro->ro_rt->rt_ifa;
3212                         IFA_ADDREF(ifa);
3213                 }
3214
3215                 if (ip_select_srcif_debug && ifa != NULL) {
3216                         printf("%s->%s ifscope %d ifa_if %s\n",
3217                             s_src, s_dst, ifscope, if_name(ifa->ifa_ifp));
3218                 }
3219         }
3220
3221         if (ro->ro_rt != NULL) {
3222                 RT_LOCK_ASSERT_HELD(ro->ro_rt);
3223         }
3224         /*
3225          * If there is a non-loopback route with the wrong interface, or if
3226          * there is no interface configured with such an address, blow it
3227          * away.  Except for local/loopback, we look for one with a matching
3228          * interface scope/index.
3229          */
3230         if (ro->ro_rt != NULL &&
3231             (ifa == NULL || (ifa->ifa_ifp != rt_ifp && rt_ifp != lo_ifp) ||
3232             !(ro->ro_rt->rt_flags & RTF_UP))) {
3233                 if (ip_select_srcif_debug) {
3234                         if (ifa != NULL) {
3235                                 printf("%s->%s ifscope %d ro_if %s != "
3236                                     "ifa_if %s (cached route cleared)\n",
3237                                     s_src, s_dst, ifscope, if_name(rt_ifp),
3238                                     if_name(ifa->ifa_ifp));
3239                         } else {
3240                                 printf("%s->%s ifscope %d ro_if %s "
3241                                     "(no ifa_if found)\n",
3242                                     s_src, s_dst, ifscope, if_name(rt_ifp));
3243                         }
3244                 }
3245
3246                 RT_UNLOCK(ro->ro_rt);
3247                 ROUTE_RELEASE(ro);
3248
3249                 /*
3250                  * If the destination is IPv4 LLA and the route's interface
3251                  * doesn't match the source interface, then the source IP
3252                  * address is wrong; it most likely belongs to the primary
3253                  * interface associated with the IPv4 LL subnet.  Drop the
3254                  * packet rather than letting it go out and return an error
3255                  * to the ULP.  This actually applies not only to IPv4 LL
3256                  * but other shared subnets; for now we explicitly test only
3257                  * for the former case and save the latter for future.
3258                  */
3259                 if (IN_LINKLOCAL(ntohl(dst.s_addr)) &&
3260                     !IN_LINKLOCAL(ntohl(src.s_addr)) && ifa != NULL) {
3261                         IFA_REMREF(ifa);
3262                         ifa = NULL;
3263                 }
3264         }
3265
3266         if (ip_select_srcif_debug && ifa == NULL) {
3267                 printf("%s->%s ifscope %d (neither ro_if/ifa_if found)\n",
3268                     s_src, s_dst, ifscope);
3269         }
3270
3271         /*
3272          * If there is a route, mark it accordingly.  If there isn't one,
3273          * we'll get here again during the next transmit (possibly with a
3274          * route) and the flag will get set at that point.  For IPv4 LLA
3275          * destination, mark it only if the route has been fully resolved;
3276          * otherwise we want to come back here again when the route points
3277          * to the interface over which the ARP reply arrives on.
3278          */
3279         if (ro->ro_rt != NULL && (!IN_LINKLOCAL(ntohl(dst.s_addr)) ||
3280             (ro->ro_rt->rt_gateway->sa_family == AF_LINK &&
3281             SDL(ro->ro_rt->rt_gateway)->sdl_alen != 0))) {
3282                 if (ifa != NULL) {
3283                         IFA_ADDREF(ifa);        /* for route */
3284                 }
3285                 if (ro->ro_srcia != NULL) {
3286                         IFA_REMREF(ro->ro_srcia);
3287                 }
3288                 ro->ro_srcia = ifa;
3289                 ro->ro_flags |= ROF_SRCIF_SELECTED;
3290                 RT_GENID_SYNC(ro->ro_rt);
3291         }
3292
3293         if (ro->ro_rt != NULL) {
3294                 RT_UNLOCK(ro->ro_rt);
3295         }
3296
3297         return ifa;
3298 }
3299
3300 /*
3301  * @brief       Given outgoing interface it determines what checksum needs
3302  *      to be computed in software and what needs to be offloaded to the
3303  *      interface.
3304  *
3305  * @param       ifp Pointer to the outgoing interface
3306  * @param       m Pointer to the packet
3307  * @param       hlen IP header length
3308  * @param       ip_len Total packet size i.e. headers + data payload
3309  * @param       sw_csum Pointer to a software checksum flag set
3310  *
3311  * @return      void
3312  */
3313 void
3314 ip_output_checksum(struct ifnet *ifp, struct mbuf *m, int hlen, int ip_len,
3315     uint32_t *sw_csum)
3316 {
3317         int tso = TSO_IPV4_OK(ifp, m);
3318         uint32_t hwcap = ifp->if_hwassist;
3319
3320         m->m_pkthdr.csum_flags |= CSUM_IP;
3321
3322         if (!hwcksum_tx) {
3323                 /* do all in software; hardware checksum offload is disabled */
3324                 *sw_csum = (CSUM_DELAY_DATA | CSUM_DELAY_IP) &
3325                     m->m_pkthdr.csum_flags;
3326         } else {
3327                 /* do in software what the hardware cannot */
3328                 *sw_csum = m->m_pkthdr.csum_flags &
3329                     ~IF_HWASSIST_CSUM_FLAGS(hwcap);
3330         }
3331
3332         if (hlen != sizeof(struct ip)) {
3333                 *sw_csum |= ((CSUM_DELAY_DATA | CSUM_DELAY_IP) &
3334                     m->m_pkthdr.csum_flags);
3335         } else if (!(*sw_csum & CSUM_DELAY_DATA) && (hwcap & CSUM_PARTIAL)) {
3336                 int interface_mtu = ifp->if_mtu;
3337
3338                 if (INTF_ADJUST_MTU_FOR_CLAT46(ifp)) {
3339                         interface_mtu = IN6_LINKMTU(ifp);
3340                         /* Further adjust the size for CLAT46 expansion */
3341                         interface_mtu -= CLAT46_HDR_EXPANSION_OVERHD;
3342                 }
3343
3344                 /*
3345                  * Partial checksum offload, if non-IP fragment, and TCP only
3346                  * (no UDP support, as the hardware may not be able to convert
3347                  * +0 to -0 (0xffff) per RFC1122 4.1.3.4. unless the interface
3348                  * supports "invert zero" capability.)
3349                  */
3350                 if (hwcksum_tx && !tso &&
3351                     ((m->m_pkthdr.csum_flags & CSUM_TCP) ||
3352                     ((hwcap & CSUM_ZERO_INVERT) &&
3353                     (m->m_pkthdr.csum_flags & CSUM_ZERO_INVERT))) &&
3354                     ip_len <= interface_mtu) {
3355                         uint16_t start = sizeof(struct ip);
3356                         uint16_t ulpoff = m->m_pkthdr.csum_data & 0xffff;
3357                         m->m_pkthdr.csum_flags |=
3358                             (CSUM_DATA_VALID | CSUM_PARTIAL);
3359                         m->m_pkthdr.csum_tx_stuff = (ulpoff + start);
3360                         m->m_pkthdr.csum_tx_start = start;
3361                         /* do IP hdr chksum in software */
3362                         *sw_csum = CSUM_DELAY_IP;
3363                 } else {
3364                         *sw_csum |= (CSUM_DELAY_DATA & m->m_pkthdr.csum_flags);
3365                 }
3366         }
3367
3368         if (*sw_csum & CSUM_DELAY_DATA) {
3369                 in_delayed_cksum(m);
3370                 *sw_csum &= ~CSUM_DELAY_DATA;
3371         }
3372
3373         if (hwcksum_tx) {
3374                 /*
3375                  * Drop off bits that aren't supported by hardware;
3376                  * also make sure to preserve non-checksum related bits.
3377                  */
3378                 m->m_pkthdr.csum_flags =
3379                     ((m->m_pkthdr.csum_flags &
3380                     (IF_HWASSIST_CSUM_FLAGS(hwcap) | CSUM_DATA_VALID)) |
3381                     (m->m_pkthdr.csum_flags & ~IF_HWASSIST_CSUM_MASK));
3382         } else {
3383                 /* drop all bits; hardware checksum offload is disabled */
3384                 m->m_pkthdr.csum_flags = 0;
3385         }
3386 }
3387
3388 /*
3389  * GRE protocol output for PPP/PPTP
3390  */
3391 int
3392 ip_gre_output(struct mbuf *m)
3393 {
3394         struct route ro;
3395         int error;
3396
3397         bzero(&ro, sizeof(ro));
3398
3399         error = ip_output(m, NULL, &ro, 0, NULL, NULL);
3400
3401         ROUTE_RELEASE(&ro);
3402
3403         return error;
3404 }
3405
3406 static int
3407 sysctl_reset_ip_output_stats SYSCTL_HANDLER_ARGS
3408 {
3409 #pragma unused(arg1, arg2)
3410         int error, i;
3411
3412         i = ip_output_measure;
3413         error = sysctl_handle_int(oidp, &i, 0, req);
3414         if (error || req->newptr == USER_ADDR_NULL) {
3415                 goto done;
3416         }
3417         /* impose bounds */
3418         if (i < 0 || i > 1) {
3419                 error = EINVAL;
3420                 goto done;
3421         }
3422         if (ip_output_measure != i && i == 1) {
3423                 net_perf_initialize(&net_perf, ip_output_measure_bins);
3424         }
3425         ip_output_measure = i;
3426 done:
3427         return error;
3428 }
3429
3430 static int
3431 sysctl_ip_output_measure_bins SYSCTL_HANDLER_ARGS
3432 {
3433 #pragma unused(arg1, arg2)
3434         int error;
3435         uint64_t i;
3436
3437         i = ip_output_measure_bins;
3438         error = sysctl_handle_quad(oidp, &i, 0, req);
3439         if (error || req->newptr == USER_ADDR_NULL) {
3440                 goto done;
3441         }
3442         /* validate data */
3443         if (!net_perf_validate_bins(i)) {
3444                 error = EINVAL;
3445                 goto done;
3446         }
3447         ip_output_measure_bins = i;
3448 done:
3449         return error;
3450 }
3451
3452 static int
3453 sysctl_ip_output_getperf SYSCTL_HANDLER_ARGS
3454 {
3455 #pragma unused(oidp, arg1, arg2)
3456         if (req->oldptr == USER_ADDR_NULL) {
3457                 req->oldlen = (size_t)sizeof(struct ipstat);
3458         }
3459
3460         return SYSCTL_OUT(req, &net_perf, MIN(sizeof(net_perf), req->oldlen));
3461 }