bsd/netinet/ip_output.c

   1 /*
   2  * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  30  *      The Regents of the University of California.  All rights reserved.
  31  *
  32  * Redistribution and use in source and binary forms, with or without
  33  * modification, are permitted provided that the following conditions
  34  * are met:
  35  * 1. Redistributions of source code must retain the above copyright
  36  *    notice, this list of conditions and the following disclaimer.
  37  * 2. Redistributions in binary form must reproduce the above copyright
  38  *    notice, this list of conditions and the following disclaimer in the
  39  *    documentation and/or other materials provided with the distribution.
  40  * 3. All advertising materials mentioning features or use of this software
  41  *    must display the following acknowledgement:
  42  *      This product includes software developed by the University of
  43  *      California, Berkeley and its contributors.
  44  * 4. Neither the name of the University nor the names of its contributors
  45  *    may be used to endorse or promote products derived from this software
  46  *    without specific prior written permission.
  47  *
  48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  58  * SUCH DAMAGE.
  59  *
  60  *      @(#)ip_output.c 8.3 (Berkeley) 1/21/94
  61  * $FreeBSD: src/sys/netinet/ip_output.c,v 1.99.2.16 2001/07/19 06:37:26 kris Exp $
  62  */
  63 /*
  64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  65  * support for mandatory and extensible security protections.  This notice
  66  * is included in support of clause 2.2 (b) of the Apple Public License,
  67  * Version 2.0.
  68  */
  69
  70 #define _IP_VHL
  71
  72 #include <sys/param.h>
  73 #include <sys/systm.h>
  74 #include <sys/kernel.h>
  75 #include <sys/malloc.h>
  76 #include <sys/mbuf.h>
  77 #include <sys/protosw.h>
  78 #include <sys/socket.h>
  79 #include <sys/socketvar.h>
  80 #include <kern/locks.h>
  81 #include <sys/sysctl.h>
  82
  83 #include <machine/endian.h>
  84
  85 #include <net/if.h>
  86 #include <net/if_dl.h>
  87 #include <net/route.h>
  88
  89 #include <netinet/in.h>
  90 #include <netinet/in_systm.h>
  91 #include <netinet/ip.h>
  92 #include <netinet/in_pcb.h>
  93 #include <netinet/in_var.h>
  94 #include <netinet/ip_var.h>
  95
  96 #include <netinet/kpi_ipfilter_var.h>
  97
  98 #if CONFIG_MACF_NET
  99 #include <security/mac_framework.h>
 100 #endif
 101
 102 #include "faith.h"
 103
 104 #include <net/dlil.h>
 105 #include <sys/kdebug.h>
 106 #include <libkern/OSAtomic.h>
 107
  108 #define DBG_LAYER_BEG           NETDBG_CODE(DBG_NETIP, 1)   /* ip_output_list() traces dst/src/proto/off/len under this code */
  109 #define DBG_LAYER_END           NETDBG_CODE(DBG_NETIP, 3)
  110 #define DBG_FNC_IP_OUTPUT       NETDBG_CODE(DBG_NETIP, (1 << 8) | 1)   /* ip_output_list() emits this with DBG_FUNC_START on entry */
  111 #define DBG_FNC_IPSEC4_OUTPUT   NETDBG_CODE(DBG_NETIP, (2 << 8) | 1)
  112 /* Swap the two bytes of a 16-bit quantity; v is expanded twice and bits above 15 are not masked off. */
  113 #define SWAP16(v) ((((v) & 0xff) << 8) | ((v) >> 8))
 114
 115 #if IPSEC
 116 #include <netinet6/ipsec.h>
 117 #include <netkey/key.h>
 118 #if IPSEC_DEBUG
 119 #include <netkey/key_debug.h>
 120 #else
 121 #define KEYDEBUG(lev,arg)
 122 #endif
 123 #endif /*IPSEC*/
 124
 125 #include <netinet/ip_fw.h>
 126 #include <netinet/ip_divert.h>
 127
 128 #if DUMMYNET
 129 #include <netinet/ip_dummynet.h>
 130 #endif
 131
 132 #if PF
 133 #include <net/pfvar.h>
 134 #endif /* PF */
 135
 136 #if IPFIREWALL_FORWARD_DEBUG
 137 #define print_ip(a)      printf("%ld.%ld.%ld.%ld",(ntohl(a.s_addr)>>24)&0xFF,\
 138                                                   (ntohl(a.s_addr)>>16)&0xFF,\
 139                                                   (ntohl(a.s_addr)>>8)&0xFF,\
 140                                                   (ntohl(a.s_addr))&0xFF);
 141 #endif
 142
 143
  144 u_short ip_id;          /* feeds ip->ip_id in ip_output_list() when RANDOM_IP_ID is off; post-incremented on each use */
  145 /* Helpers with internal linkage. */
  146 static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);     /* int out-param becomes hlen in ip_output_list() */
  147 static struct ifnet *ip_multicast_if(struct in_addr *, int *);
  148 static void     ip_mloopback(struct ifnet *, struct mbuf *,
  149         struct sockaddr_in *, int);
  150 static int      ip_getmoptions(struct sockopt *, struct ip_moptions *);
  151 static int      ip_pcbopts(int, struct mbuf **, struct mbuf *);
  152 static int      ip_setmoptions(struct sockopt *, struct ip_moptions **);
  153
  154 static void ip_out_cksum_stats(int, u_int32_t);
  155 static struct ifaddr *in_selectsrcif(struct ip *, struct route *, unsigned int);       /* may return NULL; ip_output_list() ifafree()s a non-NULL result */
  156 static void ip_bindif(struct inpcb *, unsigned int);
  157 /* Functions declared here with external linkage. */
  158 int ip_createmoptions(struct ip_moptions **imop);
  159 int ip_addmembership(struct ip_moptions *imo, struct ip_mreq *mreq);
  160 int ip_dropmembership(struct ip_moptions *imo, struct ip_mreq *mreq);
  161 int     ip_optcopy(struct ip *, struct ip *);
  162 void in_delayed_cksum_offset(struct mbuf *, int );
  163 void in_cksum_offset(struct mbuf* , size_t );
  164 /* extern declarations: the objects themselves are defined elsewhere. */
  165 extern int (*fr_checkp)(struct ip *, int, struct ifnet *, int, struct mbuf **);
  166
  167 extern  struct protosw inetsw[];
  168
  169 extern struct ip_linklocal_stat ip_linklocal_stat;
  170 extern lck_mtx_t *ip_mutex;
 171
  172 /* temporary: for testing */
  173 #if IPSEC
  174 extern int ipsec_bypass;        /* nonzero: ip_output_list() skips its ipsec_getsocket()/ipsec_setsocket() step */
  175 #endif
  176 /* Read-write integer sysctls registered under _net_inet_ip. */
  177 static int      ip_maxchainsent = 0;
  178 SYSCTL_INT(_net_inet_ip, OID_AUTO, maxchainsent, CTLFLAG_RW,
  179     &ip_maxchainsent, 0, "use dlil_output_list");
  180 #if DEBUG
  181 static int forge_ce = 0;        /* count of ECT0/ECT1 packets ip_output_list() will still re-mark as CE; decremented per packet */
  182 SYSCTL_INT(_net_inet_ip, OID_AUTO, forge_ce, CTLFLAG_RW,
  183     &forge_ce, 0, "Forge ECN CE");
  184 #endif /* DEBUG */
  185
  186 static int ip_select_srcif_debug = 0;
  187 SYSCTL_INT(_net_inet_ip, OID_AUTO, select_srcif_debug, CTLFLAG_RW,
  188     &ip_select_srcif_debug, 0, "log source interface selection debug info");
 189
 190 /*
 191  * IP output.  The packet in mbuf chain m contains a skeletal IP
 192  * header (with len, off, ttl, proto, tos, src, dst).
 193  * The mbuf chain containing the packet will be freed.
 194  * The mbuf opt, if present, will not be freed.
 195  */
 196 int
 197 ip_output(
 198         struct mbuf *m0,
 199         struct mbuf *opt,
 200         struct route *ro,
 201         int flags,
 202         struct ip_moptions *imo,
 203         struct ip_out_args *ipoa)
 204 {
 205         int error;
 206         error = ip_output_list(m0, 0, opt, ro, flags, imo, ipoa);
 207         return error;
 208 }
 209
 210 /*
 211  * Returns:     0                       Success
 212  *              ENOMEM
 213  *              EADDRNOTAVAIL
 214  *              ENETUNREACH
 215  *              EHOSTUNREACH
 216  *              EACCES
 217  *              EMSGSIZE
 218  *              ENOBUFS
 219  *      ipsec4_getpolicybyaddr:???      [IPSEC 4th argument, contents modified]
 220  *      ipsec4_getpolicybysock:???      [IPSEC 4th argument, contents modified]
 221  *      key_spdacquire:???              [IPSEC]
 222  *      ipsec4_output:???               [IPSEC]
 223  *      <fr_checkp>:???                 [firewall]
 224  *      ip_dn_io_ptr:???                [dummynet]
 225  *      dlil_output:???                 [DLIL]
 226  *      dlil_output_list:???            [DLIL]
 227  *
 228  * Notes:       The ipsec4_getpolicyby{addr|sock} function error returns are
 229  *              only used as the error return from this function where one of
 230  *              these functions fails to return a policy.
 231  */
 232 int
 233 ip_output_list(
 234         struct mbuf *m0,
 235         int packetchain,
 236         struct mbuf *opt,
 237         struct route *ro,
 238         int flags,
 239         struct ip_moptions *imo,
 240         struct ip_out_args *ipoa
 241         )
 242 {
 243         struct ip *ip;
 244         struct ifnet *ifp = NULL;
 245         struct mbuf *m = m0, **mppn = NULL;
 246         int hlen = sizeof (struct ip);
 247         int len = 0, off, error = 0;
 248         struct sockaddr_in *dst = NULL;
 249         struct in_ifaddr *ia = NULL, *src_ia = NULL;
 250         int isbroadcast, sw_csum;
 251         struct in_addr pkt_dst;
 252 #if IPSEC
 253         struct route iproute;
 254         struct socket *so = NULL;
 255         struct secpolicy *sp = NULL;
 256 #endif
 257 #if IPFIREWALL_FORWARD
 258         int fwd_rewrite_src = 0;
 259 #endif
 260 #if IPFIREWALL
 261         struct ip_fw_args args;
 262 #endif
 263         int didfilter = 0;
 264         ipfilter_t inject_filter_ref = 0;
 265         struct m_tag    *tag;
 266         struct route    saved_route;
 267         struct ip_out_args saved_ipoa;
 268         struct mbuf * packetlist;
 269         int pktcnt = 0, tso = 0;
 270         unsigned int ifscope;
 271         boolean_t select_srcif;
 272
 273         KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0);
 274
 275         packetlist = m0;
 276 #if IPFIREWALL
 277         args.next_hop = NULL;
 278         args.eh = NULL;
 279         args.rule = NULL;
 280         args.divert_rule = 0;                   /* divert cookie */
 281         args.ipoa = NULL;
 282
 283         if (SLIST_EMPTY(&m0->m_pkthdr.tags))
 284                 goto ipfw_tags_done;
 285
 286         /* Grab info from mtags prepended to the chain */
 287 #if DUMMYNET
 288         if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID,
 289             KERNEL_TAG_TYPE_DUMMYNET, NULL)) != NULL) {
 290                 struct dn_pkt_tag       *dn_tag;
 291
 292                 dn_tag = (struct dn_pkt_tag *)(tag+1);
 293                 args.rule = dn_tag->rule;
 294                 opt = NULL;
 295                 saved_route = dn_tag->ro;
 296                 ro = &saved_route;
 297
 298                 imo = NULL;
 299                 dst = dn_tag->dn_dst;
 300                 ifp = dn_tag->ifp;
 301                 flags = dn_tag->flags;
 302                 saved_ipoa = dn_tag->ipoa;
 303                 ipoa = &saved_ipoa;
 304
 305                 m_tag_delete(m0, tag);
 306         }
 307 #endif /* DUMMYNET */
 308
 309 #if IPDIVERT
 310         if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID,
 311             KERNEL_TAG_TYPE_DIVERT, NULL)) != NULL) {
 312                 struct divert_tag       *div_tag;
 313
 314                 div_tag = (struct divert_tag *)(tag+1);
 315                 args.divert_rule = div_tag->cookie;
 316
 317                 m_tag_delete(m0, tag);
 318         }
 319 #endif /* IPDIVERT */
 320
 321         if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID,
 322             KERNEL_TAG_TYPE_IPFORWARD, NULL)) != NULL) {
 323                 struct ip_fwd_tag       *ipfwd_tag;
 324
 325                 ipfwd_tag = (struct ip_fwd_tag *)(tag+1);
 326                 args.next_hop = ipfwd_tag->next_hop;
 327
 328                 m_tag_delete(m0, tag);
 329         }
 330 ipfw_tags_done:
 331 #endif /* IPFIREWALL */
 332
 333         m = m0;
 334
 335 #if     DIAGNOSTIC
 336         if ( !m || (m->m_flags & M_PKTHDR) != 0)
 337                 panic("ip_output no HDR");
 338         if (!ro)
 339                 panic("ip_output no route, proto = %d",
 340                       mtod(m, struct ip *)->ip_p);
 341 #endif
 342
 343         /*
 344          * At present the IP_OUTARGS flag implies a request for IP to
 345          * perform source interface selection.  In the forwarding case,
 346          * only the ifscope value is used, as source interface selection
 347          * doesn't take place.
 348          */
 349         if (ip_doscopedroute && (flags & IP_OUTARGS)) {
 350                 select_srcif = !(flags & IP_FORWARDING);
 351                 ifscope = ipoa->ipoa_ifscope;
 352         } else {
 353                 select_srcif = FALSE;
 354                 ifscope = IFSCOPE_NONE;
 355         }
 356
 357 #if IPFIREWALL
 358         if (args.rule != NULL) {        /* dummynet already saw us */
 359                 ip = mtod(m, struct ip *);
 360                 hlen = IP_VHL_HL(ip->ip_vhl) << 2 ;
 361                 if (ro->ro_rt != NULL) {
 362                         RT_LOCK_SPIN(ro->ro_rt);
 363                         ia = (struct in_ifaddr *)ro->ro_rt->rt_ifa;
 364                         if (ia)
 365                                 ifaref(&ia->ia_ifa);
 366                         RT_UNLOCK(ro->ro_rt);
 367                 }
 368 #if IPSEC
 369                 if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) {
 370                         so = ipsec_getsocket(m);
 371                         (void)ipsec_setsocket(m, NULL);
 372                 }
 373 #endif
 374                 goto sendit;
 375         }
 376 #endif /* IPFIREWALL */
 377
 378 #if IPSEC
 379         if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) {
 380                 so = ipsec_getsocket(m);
 381                 (void)ipsec_setsocket(m, NULL);
 382         }
 383 #endif
 384 loopit:
 385         /*
  386          * No need to process packet twice if we've
 387          * already seen it
 388          */
 389         if (!SLIST_EMPTY(&m->m_pkthdr.tags))
 390                 inject_filter_ref = ipf_get_inject_filter(m);
 391         else
 392                 inject_filter_ref = 0;
 393
 394         if (opt) {
 395                 m = ip_insertoptions(m, opt, &len);
 396                 hlen = len;
 397         }
 398         ip = mtod(m, struct ip *);
 399 #if IPFIREWALL
 400         pkt_dst = args.next_hop ? args.next_hop->sin_addr : ip->ip_dst;
 401 #else
 402         pkt_dst = ip->ip_dst;
 403 #endif
 404
 405         /*
 406          * Fill in IP header.
 407          */
 408         if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
 409                 ip->ip_vhl = IP_MAKE_VHL(IPVERSION, hlen >> 2);
 410                 ip->ip_off &= IP_DF;
 411 #if RANDOM_IP_ID
 412                 ip->ip_id = ip_randomid();
 413 #else
 414                 ip->ip_id = htons(ip_id++);
 415 #endif
 416                 OSAddAtomic(1, &ipstat.ips_localout);
 417         } else {
 418                 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
 419         }
 420
 421 #if DEBUG
 422         /* For debugging, we let the stack forge congestion */
 423         if (forge_ce != 0 &&
 424                 ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT1 ||
 425                  (ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT0)) {
 426                 ip->ip_tos = (ip->ip_tos & ~IPTOS_ECN_MASK) | IPTOS_ECN_CE;
 427                 forge_ce--;
 428         }
 429 #endif /* DEBUG */
 430
 431         KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr,
 432                      ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len);
 433
 434         dst = (struct sockaddr_in *)&ro->ro_dst;
 435
 436         /*
 437          * If there is a cached route,
 438          * check that it is to the same destination
 439          * and is still up.  If not, free it and try again.
 440          * The address family should also be checked in case of sharing the
 441          * cache with IPv6.
 442          */
 443
 444         if (ro->ro_rt != NULL) {
 445                 if (ro->ro_rt->generation_id != route_generation &&
 446                     ((flags & (IP_ROUTETOIF | IP_FORWARDING)) == 0) &&
 447                     (ip->ip_src.s_addr != INADDR_ANY)) {
 448                         src_ia = ifa_foraddr(ip->ip_src.s_addr);
 449                         if (src_ia == NULL) {
 450                                 error = EADDRNOTAVAIL;
 451                                 goto bad;
 452                         }
 453                         ifafree(&src_ia->ia_ifa);
 454                 }
 455                 /*
 456                  * Test rt_flags without holding rt_lock for performance
 457                  * reasons; if the route is down it will hopefully be
 458                  * caught by the layer below (since it uses this route
 459                  * as a hint) or during the next transmit.
 460                  */
 461                 if ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
 462                     dst->sin_family != AF_INET ||
 463                     dst->sin_addr.s_addr != pkt_dst.s_addr) {
 464                         rtfree(ro->ro_rt);
 465                         ro->ro_rt = NULL;
 466                 }
 467                 /*
 468                  * If we're doing source interface selection, we may not
 469                  * want to use this route; only synch up the generation
 470                  * count otherwise.
 471                  */
 472                 if (!select_srcif && ro->ro_rt != NULL &&
 473                     ro->ro_rt->generation_id != route_generation)
 474                         ro->ro_rt->generation_id = route_generation;
 475         }
 476         if (ro->ro_rt == NULL) {
 477                 bzero(dst, sizeof(*dst));
 478                 dst->sin_family = AF_INET;
 479                 dst->sin_len = sizeof(*dst);
 480                 dst->sin_addr = pkt_dst;
 481         }
 482         /*
 483          * If routing to interface only,
 484          * short circuit routing lookup.
 485          */
 486 #define ifatoia(ifa)    ((struct in_ifaddr *)(ifa))
 487 #define sintosa(sin)    ((struct sockaddr *)(sin))
 488         if (flags & IP_ROUTETOIF) {
 489                 if (ia)
 490                         ifafree(&ia->ia_ifa);
 491                 if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == 0) {
 492                         if ((ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == 0) {
 493                                 OSAddAtomic(1, &ipstat.ips_noroute);
 494                                 error = ENETUNREACH;
 495                                 goto bad;
 496                         }
 497                 }
 498                 ifp = ia->ia_ifp;
 499                 ip->ip_ttl = 1;
 500                 isbroadcast = in_broadcast(dst->sin_addr, ifp);
 501         } else if (IN_MULTICAST(ntohl(pkt_dst.s_addr)) &&
 502             imo != NULL && imo->imo_multicast_ifp != NULL) {
 503                 /*
 504                  * Bypass the normal routing lookup for multicast
 505                  * packets if the interface is specified.
 506                  */
 507                 ifp = imo->imo_multicast_ifp;
 508                 isbroadcast = 0;
 509                 if (ia != NULL)
 510                         ifafree(&ia->ia_ifa);
 511
 512                 /* Macro takes reference on ia */
 513                 IFP_TO_IA(ifp, ia);
 514         } else {
 515                 boolean_t cloneok = FALSE;
 516                 /*
 517                  * Perform source interface selection; the source IP address
 518                  * must belong to one of the addresses of the interface used
 519                  * by the route.  For performance reasons, do this only if
 520                  * there is no route, or if the routing table has changed,
 521                  * or if we haven't done source interface selection on this
 522                  * route (for this PCB instance) before.
 523                  */
 524                 if (select_srcif && ip->ip_src.s_addr != INADDR_ANY &&
 525                     (ro->ro_rt == NULL || !(ro->ro_rt->rt_flags & RTF_UP) ||
 526                     ro->ro_rt->generation_id != route_generation ||
 527                     !(ro->ro_flags & ROF_SRCIF_SELECTED))) {
 528                         struct ifaddr *ifa;
 529
 530                         /* Find the source interface */
 531                         ifa = in_selectsrcif(ip, ro, ifscope);
 532
 533                         /*
 534                          * If the source address is spoofed (in the case
 535                          * of IP_RAWOUTPUT), or if this is destined for
 536                          * local/loopback, just let it go out using the
 537                          * interface of the route.  Otherwise, there's no
 538                          * interface having such an address, so bail out.
 539                          */
 540                         if (ifa == NULL && !(flags & IP_RAWOUTPUT) &&
 541                             ifscope != lo_ifp->if_index) {
 542                                 error = EADDRNOTAVAIL;
 543                                 goto bad;
 544                         }
 545
 546                         /*
 547                          * If the caller didn't explicitly specify the scope,
 548                          * pick it up from the source interface.  If the cached
 549                          * route was wrong and was blown away as part of source
 550                          * interface selection, don't mask out RTF_PRCLONING
 551                          * since that route may have been allocated by the ULP,
 552                          * unless the IP header was created by the caller or
 553                          * the destination is IPv4 LLA.  The check for the
 554                          * latter is needed because IPv4 LLAs are never scoped
 555                          * in the current implementation, and we don't want to
 556                          * replace the resolved IPv4 LLA route with one whose
 557                          * gateway points to that of the default gateway on
 558                          * the primary interface of the system.
 559                          */
 560                         if (ifa != NULL) {
 561                                 if (ifscope == IFSCOPE_NONE)
 562                                         ifscope = ifa->ifa_ifp->if_index;
 563                                 ifafree(ifa);
 564                                 cloneok = (!(flags & IP_RAWOUTPUT) &&
 565                                     !(IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))));
 566                         }
 567                 }
 568
 569                 /*
 570                  * If this is the case, we probably don't want to allocate
 571                  * a protocol-cloned route since we didn't get one from the
 572                  * ULP.  This lets TCP do its thing, while not burdening
 573                  * forwarding or ICMP with the overhead of cloning a route.
 574                  * Of course, we still want to do any cloning requested by
 575                  * the link layer, as this is probably required in all cases
 576                  * for correct operation (as it is for ARP).
 577                  */
 578                 if (ro->ro_rt == NULL) {
 579                         unsigned long ign = RTF_PRCLONING;
 580                         /*
 581                          * We make an exception here: if the destination
 582                          * address is INADDR_BROADCAST, allocate a protocol-
 583                          * cloned host route so that we end up with a route
 584                          * marked with the RTF_BROADCAST flag.  Otherwise,
 585                          * we would end up referring to the default route,
 586                          * instead of creating a cloned host route entry.
 587                          * That would introduce inconsistencies between ULPs
 588                          * that allocate a route and those that don't.  The
 589                          * RTF_BROADCAST route is important since we'd want
 590                          * to send out undirected IP broadcast packets using
 591                          * link-level broadcast address. Another exception
 592                          * is for ULP-created routes that got blown away by
 593                          * source interface selection (see above).
 594                          *
 595                          * These exceptions will no longer be necessary when
 596                          * the RTF_PRCLONING scheme is no longer present.
 597                          */
 598                         if (cloneok || dst->sin_addr.s_addr == INADDR_BROADCAST)
 599                                 ign &= ~RTF_PRCLONING;
 600
 601                         /*
 602                          * Loosen the route lookup criteria if the ifscope
 603                          * corresponds to the loopback interface; this is
 604                          * needed to support Application Layer Gateways
 605                          * listening on loopback, in conjunction with packet
 606                          * filter redirection rules.  The final source IP
 607                          * address will be rewritten by the packet filter
 608                          * prior to the RFC1122 loopback check below.
 609                          */
 610                         if (ifscope == lo_ifp->if_index)
 611                                 rtalloc_ign(ro, ign);
 612                         else
 613                                 rtalloc_scoped_ign(ro, ign, ifscope);
 614                 }
 615
 616                 if (ro->ro_rt == NULL) {
 617                         OSAddAtomic(1, &ipstat.ips_noroute);
 618                         error = EHOSTUNREACH;
 619                         goto bad;
 620                 }
 621
 622                 if (ia)
 623                         ifafree(&ia->ia_ifa);
 624                 RT_LOCK_SPIN(ro->ro_rt);
 625                 ia = ifatoia(ro->ro_rt->rt_ifa);
 626                 if (ia)
 627                         ifaref(&ia->ia_ifa);
 628                 ifp = ro->ro_rt->rt_ifp;
 629                 ro->ro_rt->rt_use++;
 630                 if (ro->ro_rt->rt_flags & RTF_GATEWAY)
 631                         dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway;
 632                 if (ro->ro_rt->rt_flags & RTF_HOST) {
 633                         isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST);
 634                 } else {
 635                         /* Become a regular mutex */
 636                         RT_CONVERT_LOCK(ro->ro_rt);
 637                         isbroadcast = in_broadcast(dst->sin_addr, ifp);
 638                 }
 639                 RT_UNLOCK(ro->ro_rt);
 640         }
 641
 642         if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
 643                 struct in_multi *inm;
 644
 645                 m->m_flags |= M_MCAST;
 646                 /*
 647                  * IP destination address is multicast.  Make sure "dst"
 648                  * still points to the address in "ro".  (It may have been
 649                  * changed to point to a gateway address, above.)
 650                  */
 651                 dst = (struct sockaddr_in *)&ro->ro_dst;
 652                 /*
 653                  * See if the caller provided any multicast options
 654                  */
 655                 if (imo != NULL) {
 656                         if ((flags & IP_RAWOUTPUT) == 0) ip->ip_ttl = imo->imo_multicast_ttl;
 657                         if (imo->imo_multicast_ifp != NULL) {
 658                                 ifp = imo->imo_multicast_ifp;
 659                         }
 660 #if MROUTING
 661                         if (imo->imo_multicast_vif != -1 &&
 662                                 ((flags & IP_RAWOUTPUT) == 0 || ip->ip_src.s_addr == INADDR_ANY))
 663                                 ip->ip_src.s_addr =
 664                                         ip_mcast_src(imo->imo_multicast_vif);
 665 #endif /* MROUTING */
 666                 } else
 667                         if ((flags & IP_RAWOUTPUT) == 0) ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
 668                 /*
 669                  * Confirm that the outgoing interface supports multicast.
 670                  */
 671                 if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
 672                         if ((ifp->if_flags & IFF_MULTICAST) == 0) {
 673                                 OSAddAtomic(1, &ipstat.ips_noroute);
 674                                 error = ENETUNREACH;
 675                                 goto bad;
 676                         }
 677                 }
 678                 /*
 679                  * If source address not specified yet, use address
 680                  * of outgoing interface.
 681                  */
 682                 if (ip->ip_src.s_addr == INADDR_ANY) {
 683                         struct in_ifaddr *ia1;
 684                         lck_rw_lock_shared(in_ifaddr_rwlock);
 685                         TAILQ_FOREACH(ia1, &in_ifaddrhead, ia_link)
 686                                 if (ia1->ia_ifp == ifp) {
 687                                         ip->ip_src = IA_SIN(ia1)->sin_addr;
 688                                         break;
 689                                 }
 690                         lck_rw_done(in_ifaddr_rwlock);
 691                         if (ip->ip_src.s_addr == INADDR_ANY) {
 692                                 error = ENETUNREACH;
 693                                 goto bad;
 694                         }
 695                 }
 696
 697                 ifnet_lock_shared(ifp);
 698                 IN_LOOKUP_MULTI(pkt_dst, ifp, inm);
 699                 ifnet_lock_done(ifp);
 700                 if (inm != NULL &&
 701                    (imo == NULL || imo->imo_multicast_loop)) {
 702                         /*
 703                          * If we belong to the destination multicast group
 704                          * on the outgoing interface, and the caller did not
 705                          * forbid loopback, loop back a copy.
 706                          */
 707                         if (!TAILQ_EMPTY(&ipv4_filters)) {
 708                                 struct ipfilter *filter;
 709                                 int seen = (inject_filter_ref == 0);
 710                                 struct ipf_pktopts *ippo = 0, ipf_pktopts;
 711
 712                                 if (imo) {
 713                                         ippo = &ipf_pktopts;
 714                                         ipf_pktopts.ippo_mcast_ifnet = imo->imo_multicast_ifp;
 715                                         ipf_pktopts.ippo_mcast_ttl = imo->imo_multicast_ttl;
 716                                         ipf_pktopts.ippo_mcast_loop = imo->imo_multicast_loop;
 717                                 }
 718
 719                                 ipf_ref();
 720
 721                                 /* 4135317 - always pass network byte order to filter */
 722
 723 #if BYTE_ORDER != BIG_ENDIAN
 724                                 HTONS(ip->ip_len);
 725                                 HTONS(ip->ip_off);
 726 #endif
 727
 728                                 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
 729                                         if (seen == 0) {
 730                                                 if ((struct ipfilter *)inject_filter_ref == filter)
 731                                                         seen = 1;
 732                                         } else if (filter->ipf_filter.ipf_output) {
 733                                                 errno_t result;
 734                                                 result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo);
 735                                                 if (result == EJUSTRETURN) {
 736                                                         ipf_unref();
 737                                                         goto done;
 738                                                 }
 739                                                 if (result != 0) {
 740                                                         ipf_unref();
 741                                                         goto bad;
 742                                                 }
 743                                         }
 744                                 }
 745
 746                                 /* set back to host byte order */
 747                                 ip = mtod(m, struct ip *);
 748
 749 #if BYTE_ORDER != BIG_ENDIAN
 750                                 NTOHS(ip->ip_len);
 751                                 NTOHS(ip->ip_off);
 752 #endif
 753
 754                                 ipf_unref();
 755                                 didfilter = 1;
 756                         }
 757                         ip_mloopback(ifp, m, dst, hlen);
 758                 }
 759 #if MROUTING
 760                 else {
 761                         /*
 762                          * If we are acting as a multicast router, perform
 763                          * multicast forwarding as if the packet had just
 764                          * arrived on the interface to which we are about
 765                          * to send.  The multicast forwarding function
 766                          * recursively calls this function, using the
 767                          * IP_FORWARDING flag to prevent infinite recursion.
 768                          *
 769                          * Multicasts that are looped back by ip_mloopback(),
 770                          * above, will be forwarded by the ip_input() routine,
 771                          * if necessary.
 772                          */
 773                         if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
 774                                 /*
 775                                  * Check if rsvp daemon is running. If not, don't
 776                                  * set ip_moptions. This ensures that the packet
 777                                  * is multicast and not just sent down one link
 778                                  * as prescribed by rsvpd.
 779                                  */
 780                                 if (!rsvp_on)
 781                                   imo = NULL;
 782                                 if (ip_mforward(ip, ifp, m, imo) != 0) {
 783                                         m_freem(m);
 784                                         goto done;
 785                                 }
 786                         }
 787                 }
 788 #endif /* MROUTING */
 789
 790                 /*
 791                  * Multicasts with a time-to-live of zero may be looped-
 792                  * back, above, but must not be transmitted on a network.
 793                  * Also, multicasts addressed to the loopback interface
 794                  * are not sent -- the above call to ip_mloopback() will
 795                  * loop back a copy if this host actually belongs to the
 796                  * destination group on the loopback interface.
 797                  */
 798                 if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
 799                         m_freem(m);
 800                         goto done;
 801                 }
 802
 803                 goto sendit;
 804         }
 805 #ifndef notdef
 806         /*
 807          * If source address not specified yet, use address
 808          * of outgoing interface.
 809          */
 810         if (ip->ip_src.s_addr == INADDR_ANY) {
 811                 ip->ip_src = IA_SIN(ia)->sin_addr;
 812 #if IPFIREWALL_FORWARD
 813                 /* Keep note that we did this - if the firewall changes
 814                  * the next-hop, our interface may change, changing the
 815                  * default source IP. It's a shame so much effort happens
 816                  * twice. Oh well.
 817                  */
 818                 fwd_rewrite_src++;
 819 #endif /* IPFIREWALL_FORWARD */
 820         }
 821 #endif /* notdef */
 822
 823         /*
 824          * Look for broadcast address and
 825          * verify user is allowed to send
 826          * such a packet.
 827          */
 828         if (isbroadcast) {
 829                 if ((ifp->if_flags & IFF_BROADCAST) == 0) {
 830                         error = EADDRNOTAVAIL;
 831                         goto bad;
 832                 }
 833                 if ((flags & IP_ALLOWBROADCAST) == 0) {
 834                         error = EACCES;
 835                         goto bad;
 836                 }
 837                 /* don't allow broadcast messages to be fragmented */
 838                 if ((u_short)ip->ip_len > ifp->if_mtu) {
 839                         error = EMSGSIZE;
 840                         goto bad;
 841                 }
 842                 m->m_flags |= M_BCAST;
 843         } else {
 844                 m->m_flags &= ~M_BCAST;
 845         }
 846
 847 sendit:
 848 #if PF
 849         /* Invoke outbound packet filter */
 850         if (pf_af_hook(ifp, mppn, &m, AF_INET, FALSE) != 0) {
 851                 if (packetlist == m0) {
 852                         packetlist = m;
 853                         mppn = NULL;
 854                 }
 855                 if (m != NULL) {
 856                         m0 = m;
 857                         /* Next packet in the chain */
 858                         goto loopit;
 859                 } else if (packetlist != NULL) {
 860                         /* No more packet; send down the chain */
 861                         goto sendchain;
 862                 }
 863                 /* Nothing left; we're done */
 864                 goto done;
 865         }
 866         m0 = m;
 867         ip = mtod(m, struct ip *);
 868         pkt_dst = ip->ip_dst;
 869         hlen = IP_VHL_HL(ip->ip_vhl) << 2;
 870 #endif /* PF */
 871         /*
 872          * Force IP TTL to 255 following draft-ietf-zeroconf-ipv4-linklocal.txt
 873          */
 874         if (IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)) || IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) {
 875                 ip_linklocal_stat.iplls_out_total++;
 876                 if (ip->ip_ttl != MAXTTL) {
 877                         ip_linklocal_stat.iplls_out_badttl++;
 878                         ip->ip_ttl = MAXTTL;
 879                 }
 880         }
 881
 882         if (!didfilter && !TAILQ_EMPTY(&ipv4_filters)) {
 883                 struct ipfilter *filter;
 884                 int seen = (inject_filter_ref == 0);
 885
 886                 /* Check that a TSO frame isn't passed to a filter.
 887                  * This could happen if a filter is inserted while
 888                  * TCP is sending the TSO packet.
 889                  */
 890                 if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) {
 891                         error = EMSGSIZE;
 892                         goto bad;
 893                 }
 894
 895                 ipf_ref();
 896
 897                 /* 4135317 - always pass network byte order to filter */
 898
 899 #if BYTE_ORDER != BIG_ENDIAN
 900                 HTONS(ip->ip_len);
 901                 HTONS(ip->ip_off);
 902 #endif
 903
 904                 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
 905                         if (seen == 0) {
 906                                 if ((struct ipfilter *)inject_filter_ref == filter)
 907                                         seen = 1;
 908                         } else if (filter->ipf_filter.ipf_output) {
 909                                 errno_t result;
 910                                 result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, 0);
 911                                 if (result == EJUSTRETURN) {
 912                                         ipf_unref();
 913                                         goto done;
 914                                 }
 915                                 if (result != 0) {
 916                                         ipf_unref();
 917                                         goto bad;
 918                                 }
 919                         }
 920                 }
 921
 922                 /* set back to host byte order */
 923                 ip = mtod(m, struct ip *);
 924
 925 #if BYTE_ORDER != BIG_ENDIAN
 926                 NTOHS(ip->ip_len);
 927                 NTOHS(ip->ip_off);
 928 #endif
 929
 930                 ipf_unref();
 931         }
 932
 933 #if IPSEC
 934         /* temporary for testing only: bypass ipsec altogether */
 935
 936         if (ipsec_bypass != 0 || (flags & IP_NOIPSEC) != 0)
 937                 goto skip_ipsec;
 938
 939         KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_START, 0,0,0,0,0);
 940
 941
 942         /* get SP for this packet */
 943         if (so == NULL)
 944                 sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, flags, &error);
 945         else
 946                 sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND, so, &error);
 947
 948         if (sp == NULL) {
 949                 IPSEC_STAT_INCREMENT(ipsecstat.out_inval);
 950                 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
 951                 goto bad;
 952         }
 953
 954         error = 0;
 955
 956         /* check policy */
 957         switch (sp->policy) {
 958         case IPSEC_POLICY_DISCARD:
 959         case IPSEC_POLICY_GENERATE:
 960                 /*
 961                  * This packet is just discarded.
 962                  */
 963                 IPSEC_STAT_INCREMENT(ipsecstat.out_polvio);
 964                 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 1,0,0,0,0);
 965                 goto bad;
 966
 967         case IPSEC_POLICY_BYPASS:
 968         case IPSEC_POLICY_NONE:
 969                 /* no need to do IPsec. */
 970                 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 2,0,0,0,0);
 971                 goto skip_ipsec;
 972
 973         case IPSEC_POLICY_IPSEC:
 974                 if (sp->req == NULL) {
 975                         /* acquire a policy */
 976                         error = key_spdacquire(sp);
 977                         KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 3,0,0,0,0);
 978                         goto bad;
 979                 }
 980                 break;
 981
 982         case IPSEC_POLICY_ENTRUST:
 983         default:
 984                 printf("ip_output: Invalid policy found. %d\n", sp->policy);
 985         }
 986     {
 987         struct ipsec_output_state state;
 988         bzero(&state, sizeof(state));
 989         state.m = m;
 990         if (flags & IP_ROUTETOIF) {
 991                 state.ro = &iproute;
 992                 bzero(&iproute, sizeof(iproute));
 993         } else
 994                 state.ro = ro;
 995         state.dst = (struct sockaddr *)dst;
 996
 997         ip->ip_sum = 0;
 998
 999         /*
1000          * XXX
1001          * delayed checksums are not currently compatible with IPsec
1002          */
1003         if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1004                 in_delayed_cksum(m);
1005                 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1006         }
1007
1008
1009 #if BYTE_ORDER != BIG_ENDIAN
1010         HTONS(ip->ip_len);
1011         HTONS(ip->ip_off);
1012 #endif
1013
1014         error = ipsec4_output(&state, sp, flags);
1015
1016         m0 = m = state.m;
1017
1018         if (flags & IP_ROUTETOIF) {
1019                 /*
1020                  * if we have tunnel mode SA, we may need to ignore
1021                  * IP_ROUTETOIF.
1022                  */
1023                 if (state.ro != &iproute || state.ro->ro_rt != NULL) {
1024                         flags &= ~IP_ROUTETOIF;
1025                         ro = state.ro;
1026                 }
1027         } else
1028                 ro = state.ro;
1029
1030         dst = (struct sockaddr_in *)state.dst;
1031         if (error) {
1032                 /* mbuf is already reclaimed in ipsec4_output. */
1033                 m0 = NULL;
1034                 switch (error) {
1035                 case EHOSTUNREACH:
1036                 case ENETUNREACH:
1037                 case EMSGSIZE:
1038                 case ENOBUFS:
1039                 case ENOMEM:
1040                         break;
1041                 default:
1042                         printf("ip4_output (ipsec): error code %d\n", error);
1043                         /*fall through*/
1044                 case ENOENT:
1045                         /* don't show these error codes to the user */
1046                         error = 0;
1047                         break;
1048                 }
1049                 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 4,0,0,0,0);
1050                 goto bad;
1051         }
1052     }
1053
1054         /* be sure to update variables that are affected by ipsec4_output() */
1055         ip = mtod(m, struct ip *);
1056
1057 #ifdef _IP_VHL
1058         hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1059 #else
1060         hlen = ip->ip_hl << 2;
1061 #endif
1062         /* Check that there wasn't a route change and src is still valid */
1063         if (ro->ro_rt != NULL && ro->ro_rt->generation_id != route_generation) {
1064                 if ((src_ia = ifa_foraddr(ip->ip_src.s_addr)) == NULL &&
1065                     ((flags & (IP_ROUTETOIF | IP_FORWARDING)) == 0)) {
1066                         error = EADDRNOTAVAIL;
1067                         KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1068                             5,0,0,0,0);
1069                         goto bad;
1070                 }
1071                 rtfree(ro->ro_rt);
1072                 ro->ro_rt = NULL;
1073                 if (src_ia != NULL)
1074                         ifafree(&src_ia->ia_ifa);
1075         }
1076
1077         if (ro->ro_rt == NULL) {
1078                 if ((flags & IP_ROUTETOIF) == 0) {
1079                         printf("ip_output: can't update route after "
1080                             "IPsec processing\n");
1081                         error = EHOSTUNREACH;   /*XXX*/
1082                         KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1083                             6,0,0,0,0);
1084                         goto bad;
1085                 }
1086         } else {
1087                 if (ia)
1088                         ifafree(&ia->ia_ifa);
1089                 RT_LOCK_SPIN(ro->ro_rt);
1090                 ia = ifatoia(ro->ro_rt->rt_ifa);
1091                 if (ia)
1092                         ifaref(&ia->ia_ifa);
1093                 ifp = ro->ro_rt->rt_ifp;
1094                 RT_UNLOCK(ro->ro_rt);
1095         }
1096
1097         /* make it flipped, again. */
1098
1099 #if BYTE_ORDER != BIG_ENDIAN
1100         NTOHS(ip->ip_len);
1101         NTOHS(ip->ip_off);
1102 #endif
1103
1104         KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 7,0xff,0xff,0xff,0xff);
1105
1106         /* Pass to filters again */
1107         if (!TAILQ_EMPTY(&ipv4_filters)) {
1108                 struct ipfilter *filter;
1109
1110                 /* Check that a TSO frame isn't passed to a filter.
1111                  * This could happen if a filter is inserted while
1112                  * TCP is sending the TSO packet.
1113                  */
1114                 if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) {
1115                         error = EMSGSIZE;
1116                         goto bad;
1117                 }
1118
1119                 ipf_ref();
1120
1121                 /* 4135317 - always pass network byte order to filter */
1122
1123 #if BYTE_ORDER != BIG_ENDIAN
1124                 HTONS(ip->ip_len);
1125                 HTONS(ip->ip_off);
1126 #endif
1127
1128                 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
1129                         if (filter->ipf_filter.ipf_output) {
1130                                 errno_t result;
1131                                 result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, 0);
1132                                 if (result == EJUSTRETURN) {
1133                                         ipf_unref();
1134                                         goto done;
1135                                 }
1136                                 if (result != 0) {
1137                                         ipf_unref();
1138                                         goto bad;
1139                                 }
1140                         }
1141                 }
1142
1143                 /* set back to host byte order */
1144                 ip = mtod(m, struct ip *);
1145
1146 #if BYTE_ORDER != BIG_ENDIAN
1147                 NTOHS(ip->ip_len);
1148                 NTOHS(ip->ip_off);
1149 #endif
1150
1151                 ipf_unref();
1152         }
1153 skip_ipsec:
1154 #endif /*IPSEC*/
1155
1156 #if IPFIREWALL
1157         /*
1158          * IpHack's section.
1159          * - Xlate: translate packet's addr/port (NAT).
1160          * - Firewall: deny/allow/etc.
1161          * - Wrap: fake packet's addr/port <unimpl.>
1162          * - Encapsulate: put it in another IP and send out. <unimp.>
1163          */
1164         if (fr_checkp) {
1165                 struct  mbuf    *m1 = m;
1166
1167                 if ((error = (*fr_checkp)(ip, hlen, ifp, 1, &m1)) || !m1) {
1168                         goto done;
1169                 }
1170                 ip = mtod(m0 = m = m1, struct ip *);
1171         }
1172
1173         /*
1174          * Check with the firewall...
1175          * but not if we are already being fwd'd from a firewall.
1176          */
1177         if (fw_enable && IPFW_LOADED && !args.next_hop) {
1178                 struct sockaddr_in *old = dst;
1179
1180                 args.m = m;
1181                 args.next_hop = dst;
1182                 args.oif = ifp;
1183                 off = ip_fw_chk_ptr(&args);
1184                 m = args.m;
1185                 dst = args.next_hop;
1186
1187                 /*
1188                  * On return we must do the following:
1189                  * IP_FW_PORT_DENY_FLAG         -> drop the pkt (XXX new)
1190                  * 1<=off<= 0xffff   -> DIVERT
1191                  * (off & IP_FW_PORT_DYNT_FLAG) -> send to a DUMMYNET pipe
1192                  * (off & IP_FW_PORT_TEE_FLAG)  -> TEE the packet
1193                  * dst != old        -> IPFIREWALL_FORWARD
1194                  * off==0, dst==old  -> accept
1195                  * If some of the above modules are not compiled in, then
1196                  * we shouldn't have to check the corresponding condition
1197                  * (because the ipfw control socket should not accept
1198                  * unsupported rules), but better play safe and drop
1199                  * packets in case of doubt.
1200                  */
1201                 m0 = m;
1202                 if ( (off & IP_FW_PORT_DENY_FLAG) || m == NULL) {
1203                         if (m)
1204                                 m_freem(m);
1205                         error = EACCES ;
1206                         goto done ;
1207                 }
1208                 ip = mtod(m, struct ip *);
1209
1210                 if (off == 0 && dst == old) {/* common case */
1211                         goto pass ;
1212                 }
1213 #if DUMMYNET
1214                 if (DUMMYNET_LOADED && (off & IP_FW_PORT_DYNT_FLAG) != 0) {
1215                         /*
1216                          * pass the pkt to dummynet. Need to include
1217                          * pipe number, m, ifp, ro, dst because these are
1218                          * not recomputed in the next pass.
1219                          * All other parameters have been already used and
1220                          * so they are not needed anymore.
1221                          * XXX note: if the ifp or ro entry are deleted
1222                          * while a pkt is in dummynet, we are in trouble!
1223                          */
1224                         args.ro = ro;
1225                         args.dst = dst;
1226                         args.flags = flags;
1227                         if (flags & IP_OUTARGS)
1228                                 args.ipoa = ipoa;
1229
1230                         error = ip_dn_io_ptr(m, off & 0xffff, DN_TO_IP_OUT,
1231                             &args);
1232                         goto done;
1233                 }
1234 #endif /* DUMMYNET */
1235 #if IPDIVERT
1236                 if (off != 0 && (off & IP_FW_PORT_DYNT_FLAG) == 0) {
1237                         struct mbuf *clone = NULL;
1238
1239                         /* Clone packet if we're doing a 'tee' */
1240                         if ((off & IP_FW_PORT_TEE_FLAG) != 0)
1241                                 clone = m_dup(m, M_DONTWAIT);
1242                         /*
1243                          * XXX
1244                          * delayed checksums are not currently compatible
1245                          * with divert sockets.
1246                          */
1247                         if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1248                                 in_delayed_cksum(m);
1249                                 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1250                         }
1251
1252                         /* Restore packet header fields to original values */
1253
1254 #if BYTE_ORDER != BIG_ENDIAN
1255                         HTONS(ip->ip_len);
1256                         HTONS(ip->ip_off);
1257 #endif
1258
1259                         /* Deliver packet to divert input routine */
1260                         divert_packet(m, 0, off & 0xffff, args.divert_rule);
1261
1262                         /* If 'tee', continue with original packet */
1263                         if (clone != NULL) {
1264                                 m0 = m = clone;
1265                                 ip = mtod(m, struct ip *);
1266                                 goto pass;
1267                         }
1268                         goto done;
1269                 }
1270 #endif
1271
1272 #if IPFIREWALL_FORWARD
1273                 /* Here we check dst to make sure it's directly reachable on the
1274                  * interface we previously thought it was.
1275                  * If it isn't (which may be likely in some situations) we have
1276                  * to re-route it (ie, find a route for the next-hop and the
1277                  * associated interface) and set them here. This is nested
1278                  * forwarding which in most cases is undesirable, except where
1279                  * such control is nigh impossible. So we do it here.
1280                  * And I'm babbling.
1281                  */
1282                 if (off == 0 && old != dst) {
1283                         struct in_ifaddr *ia_fw;
1284
1285                         /* It's changed... */
1286                         /* There must be a better way to do this next line... */
1287                         static struct route sro_fwd, *ro_fwd = &sro_fwd;
1288 #if IPFIREWALL_FORWARD_DEBUG
1289                         printf("IPFIREWALL_FORWARD: New dst ip: ");
1290                         print_ip(dst->sin_addr);
1291                         printf("\n");
1292 #endif
1293                         /*
1294                          * We need to figure out if we have been forwarded
1295                          * to a local socket. If so then we should somehow
1296                          * "loop back" to ip_input, and get directed to the
1297                          * PCB as if we had received this packet. This is
1298                          * because it may be difficult to identify the packets
1299                          * you want to forward until they are being output
1300                          * and have selected an interface. (e.g. locally
1301                          * initiated packets) If we used the loopback interface,
1302                          * we would not be able to control what happens
1303                          * as the packet runs through ip_input() as
1304                          * it is done through an ISR.
1305                          */
1306                         lck_rw_lock_shared(in_ifaddr_rwlock);
1307                         TAILQ_FOREACH(ia_fw, &in_ifaddrhead, ia_link) {
1308                                 /*
1309                                  * If the addr to forward to is one
1310                                  * of ours, we pretend to
1311                                  * be the destination for this packet.
1312                                  */
1313                                 if (IA_SIN(ia_fw)->sin_addr.s_addr ==
1314                                                  dst->sin_addr.s_addr)
1315                                         break;
1316                         }
1317                         lck_rw_done(in_ifaddr_rwlock);
1318                         if (ia_fw) {
1319                                 /* tell ip_input "dont filter" */
1320                                 struct m_tag            *fwd_tag;
1321                                 struct ip_fwd_tag       *ipfwd_tag;
1322
1323                                 fwd_tag = m_tag_alloc(KERNEL_MODULE_TAG_ID,
1324                                     KERNEL_TAG_TYPE_IPFORWARD,
1325                                     sizeof (*ipfwd_tag), M_NOWAIT);
1326                                 if (fwd_tag == NULL) {
1327                                         error = ENOBUFS;
1328                                         goto bad;
1329                                 }
1330
1331                                 ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag+1);
1332                                 ipfwd_tag->next_hop = args.next_hop;
1333
1334                                 m_tag_prepend(m, fwd_tag);
1335
1336                                 if (m->m_pkthdr.rcvif == NULL)
1337                                         m->m_pkthdr.rcvif = ifunit("lo0");
1338                                 if ((~IF_HWASSIST_CSUM_FLAGS(m->m_pkthdr.rcvif->if_hwassist) &
1339                                                 m->m_pkthdr.csum_flags) == 0) {
1340                                         if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1341                                                 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1342                                                 m->m_pkthdr.csum_flags |=
1343                                                         CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
1344                                                 m->m_pkthdr.csum_data = 0xffff;
1345                                         }
1346                                         m->m_pkthdr.csum_flags |=
1347                                                 CSUM_IP_CHECKED | CSUM_IP_VALID;
1348                                 }
1349                                 else if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1350                                         in_delayed_cksum(m);
1351                                         m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1352                                         ip->ip_sum = in_cksum(m, hlen);
1353                                 }
1354
1355 #if BYTE_ORDER != BIG_ENDIAN
1356                                 HTONS(ip->ip_len);
1357                                 HTONS(ip->ip_off);
1358 #endif
1359
1360                                 /*  we need to call dlil_output to run filters
1361                                  *      and resync to avoid recursion loops.
1362                                  */
1363                                 if (lo_ifp) {
1364                                         dlil_output(lo_ifp, PF_INET, m, 0, (struct sockaddr *)dst, 0);
1365                                 }
1366                                 else {
1367                                         printf("ip_output: no loopback ifp for forwarding!!!\n");
1368                                 }
1369                                 goto done;
1370                         }
1371                         /* Some of the logic for this was
1372                          * nicked from above.
1373                          *
1374                          * This rewrites the cached route in a local PCB.
1375                          * Is this what we want to do?
1376                          */
1377                         bcopy(dst, &ro_fwd->ro_dst, sizeof(*dst));
1378
1379                         ro_fwd->ro_rt = NULL;
1380                         rtalloc_ign(ro_fwd, RTF_PRCLONING);
1381
1382                         if (ro_fwd->ro_rt == NULL) {
1383                                 OSAddAtomic(1, &ipstat.ips_noroute);
1384                                 error = EHOSTUNREACH;
1385                                 goto bad;
1386                         }
1387
1388                         RT_LOCK_SPIN(ro_fwd->ro_rt);
1389                         ia_fw = ifatoia(ro_fwd->ro_rt->rt_ifa);
1390                         if (ia_fw != NULL)
1391                                 ifaref(&ia_fw->ia_ifa);
1392                         ifp = ro_fwd->ro_rt->rt_ifp;
1393                         ro_fwd->ro_rt->rt_use++;
1394                         if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY)
1395                                 dst = (struct sockaddr_in *)ro_fwd->ro_rt->rt_gateway;
1396                         if (ro_fwd->ro_rt->rt_flags & RTF_HOST) {
1397                                 isbroadcast =
1398                                     (ro_fwd->ro_rt->rt_flags & RTF_BROADCAST);
1399                         } else {
1400                                 /* Become a regular mutex */
1401                                 RT_CONVERT_LOCK(ro_fwd->ro_rt);
1402                                 isbroadcast = in_broadcast(dst->sin_addr, ifp);
1403                         }
1404                         RT_UNLOCK(ro_fwd->ro_rt);
1405                         rtfree(ro->ro_rt);
1406                         ro->ro_rt = ro_fwd->ro_rt;
1407                         dst = (struct sockaddr_in *)&ro_fwd->ro_dst;
1408
1409                         /*
1410                          * If we added a default src ip earlier,
1411                          * which would have been gotten from the-then
1412                          * interface, do it again, from the new one.
1413                          */
1414                         if (ia_fw != NULL) {
1415                                 if (fwd_rewrite_src)
1416                                         ip->ip_src = IA_SIN(ia_fw)->sin_addr;
1417                                 ifafree(&ia_fw->ia_ifa);
1418                         }
1419                         goto pass ;
1420                 }
1421 #endif /* IPFIREWALL_FORWARD */
1422                 /*
1423                  * if we get here, none of the above matches, and
1424                  * we have to drop the pkt
1425                  */
1426                 m_freem(m);
1427                 error = EACCES; /* not sure this is the right error msg */
1428                 goto done;
1429         }
1430 #endif /* IPFIREWALL */
1431
1432 pass:
1433 #if __APPLE__
1434         /* Do not allow loopback address to wind up on a wire */
1435         if ((ifp->if_flags & IFF_LOOPBACK) == 0 &&
1436                  ((ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
1437                   (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)) {
1438                 OSAddAtomic(1, &ipstat.ips_badaddr);
1439                 m_freem(m);
1440                 /*
1441                  * Do not simply drop the packet just like a firewall -- we want the
1442                  * application to feel the pain.
1443                  * Return ENETUNREACH like ip6_output does in some similar cases.
1444                  * This can startle the otherwise clueless process that specifies
1445                  * loopback as the source address.
1446                  */
1447                 error = ENETUNREACH;
1448                 goto done;
1449         }
1450 #endif
1451         m->m_pkthdr.csum_flags |= CSUM_IP;
1452         tso =  (ifp->if_hwassist & IFNET_TSO_IPV4) && (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4);
1453
1454         sw_csum = m->m_pkthdr.csum_flags
1455                 & ~IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist);
1456
1457         if ((ifp->if_hwassist & CSUM_TCP_SUM16) != 0) {
1458                 /*
1459                  * Special case code for GMACE
1460                  * frames that can be checksumed by GMACE SUM16 HW:
1461                  * frame >64, no fragments, no UDP
1462                  */
1463                 if (apple_hwcksum_tx && (m->m_pkthdr.csum_flags & CSUM_TCP)
1464                         && (ip->ip_len > 50) && (ip->ip_len <= ifp->if_mtu)) {
1465                         /* Apple GMAC HW, expects STUFF_OFFSET << 16  | START_OFFSET */
1466                         u_short offset = (IP_VHL_HL(ip->ip_vhl) << 2) +14 ; /* IP+Enet header length */
1467                         u_short csumprev= m->m_pkthdr.csum_data & 0xFFFF;
1468                         m->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_TCP_SUM16; /* for GMAC */
1469                         m->m_pkthdr.csum_data = (csumprev + offset)  << 16 ;
1470                         m->m_pkthdr.csum_data += offset;
1471                 sw_csum = CSUM_DELAY_IP; /* do IP hdr chksum in software */
1472                 }
1473                 else {
1474                         /* let the software handle any UDP or TCP checksums */
1475                         sw_csum |= (CSUM_DELAY_DATA & m->m_pkthdr.csum_flags);
1476                 }
1477         } else if (apple_hwcksum_tx == 0) {
1478                 sw_csum |= (CSUM_DELAY_DATA | CSUM_DELAY_IP) &
1479                     m->m_pkthdr.csum_flags;
1480         }
1481
1482         if (sw_csum & CSUM_DELAY_DATA) {
1483                 in_delayed_cksum(m);
1484                 sw_csum &= ~CSUM_DELAY_DATA;
1485                 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1486         }
1487
1488         if (apple_hwcksum_tx != 0) {
1489                 m->m_pkthdr.csum_flags &=
1490                     IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist);
1491         } else {
1492                 m->m_pkthdr.csum_flags = 0;
1493         }
1494
1495         /*
1496          * If small enough for interface, or the interface will take
1497          * care of the fragmentation for us, can just send directly.
1498          */
1499         if ((u_short)ip->ip_len <= ifp->if_mtu || tso ||
1500             ifp->if_hwassist & CSUM_FRAGMENT) {
1501                 if (tso)
1502                         m->m_pkthdr.csum_flags |= CSUM_TSO_IPV4;
1503
1504
1505 #if BYTE_ORDER != BIG_ENDIAN
1506                 HTONS(ip->ip_len);
1507                 HTONS(ip->ip_off);
1508 #endif
1509
1510                 ip->ip_sum = 0;
1511                 if (sw_csum & CSUM_DELAY_IP) {
1512                         ip->ip_sum = in_cksum(m, hlen);
1513                 }
1514
1515 #ifndef __APPLE__
1516                 /* Record statistics for this interface address. */
1517                 if (!(flags & IP_FORWARDING) && ia != NULL) {
1518                         ia->ia_ifa.if_opackets++;
1519                         ia->ia_ifa.if_obytes += m->m_pkthdr.len;
1520                 }
1521 #endif
1522
1523 #if IPSEC
1524                 /* clean ipsec history once it goes out of the node */
1525                 if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0)
1526                         ipsec_delaux(m);
1527 #endif
1528                 if (packetchain == 0) {
1529                         error = ifnet_output(ifp, PF_INET, m, ro->ro_rt,
1530                             (struct sockaddr *)dst);
1531                         goto done;
1532                 }
1533                 else { /* packet chaining allows us to reuse the route for all packets */
1534                         mppn = &m->m_nextpkt;
1535                         m = m->m_nextpkt;
1536                         if (m == NULL) {
1537 #if PF
1538 sendchain:
1539 #endif /* PF */
1540                                 if (pktcnt > ip_maxchainsent)
1541                                         ip_maxchainsent = pktcnt;
1542                                 //send
1543                                 error = ifnet_output(ifp, PF_INET, packetlist,
1544                                     ro->ro_rt, (struct sockaddr *)dst);
1545                                 pktcnt = 0;
1546                                 goto done;
1547
1548                         }
1549                         m0 = m;
1550                         pktcnt++;
1551                         goto loopit;
1552                 }
1553         }
1554         /*
1555          * Too large for interface; fragment if possible.
1556          * Must be able to put at least 8 bytes per fragment.
1557          */
1558
1559         if (ip->ip_off & IP_DF  || (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4)) {
1560                 error = EMSGSIZE;
1561                 /*
1562                  * This case can happen if the user changed the MTU
1563                  *
1564                  * of an interface after enabling IP on it.  Because
1565                  * most netifs don't keep track of routes pointing to
1566                  * them, there is no way for one to update all its
1567                  * routes when the MTU is changed.
1568                  */
1569                 RT_LOCK_SPIN(ro->ro_rt);
1570                 if (ro->ro_rt && (ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST))
1571                     && !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU)
1572                     && (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) {
1573                         ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
1574                 }
1575                 RT_UNLOCK(ro->ro_rt);
1576                 OSAddAtomic(1, &ipstat.ips_cantfrag);
1577                 goto bad;
1578         }
1579
1580         error = ip_fragment(m, ifp, ifp->if_mtu, sw_csum);
1581         if (error != 0) {
1582                 m0 = m = NULL;
1583                 goto bad;
1584         }
1585
1586         KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr,
1587                      ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len);
1588
1589         for (m = m0; m; m = m0) {
1590                 m0 = m->m_nextpkt;
1591                 m->m_nextpkt = 0;
1592 #if IPSEC
1593                 /* clean ipsec history once it goes out of the node */
1594                 if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0)
1595                         ipsec_delaux(m);
1596 #endif
1597                 if (error == 0) {
1598 #ifndef __APPLE__
1599                         /* Record statistics for this interface address. */
1600                         if (ia != NULL) {
1601                                 ia->ia_ifa.if_opackets++;
1602                                 ia->ia_ifa.if_obytes += m->m_pkthdr.len;
1603                         }
1604 #endif
1605                         if ((packetchain != 0)  && (pktcnt > 0))
1606                                 panic("ip_output: mix of packet in packetlist is wrong=%p", packetlist);
1607                         error = ifnet_output(ifp, PF_INET, m, ro->ro_rt,
1608                             (struct sockaddr *)dst);
1609                 } else
1610                         m_freem(m);
1611         }
1612
1613         if (error == 0)
1614                 OSAddAtomic(1, &ipstat.ips_fragmented);
1615
1616 done:
1617         if (ia) {
1618                 ifafree(&ia->ia_ifa);
1619                 ia = NULL;
1620         }
1621 #if IPSEC
1622         if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) {
1623         if (ro == &iproute && ro->ro_rt) {
1624                 rtfree(ro->ro_rt);
1625                 ro->ro_rt = NULL;
1626         }
1627         if (sp != NULL) {
1628                 KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
1629                         printf("DP ip_output call free SP:%x\n", sp));
1630                 key_freesp(sp, KEY_SADB_UNLOCKED);
1631         }
1632         }
1633 #endif /* IPSEC */
1634
1635         KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_END, error,0,0,0,0);
1636         return (error);
1637 bad:
1638         m_freem(m0);
1639         goto done;
1640 }
1641
1642 int
1643 ip_fragment(struct mbuf *m, struct ifnet *ifp, unsigned long mtu, int sw_csum)
1644 {
1645         struct ip *ip, *mhip;
1646         int len, hlen, mhlen, firstlen, off, error = 0;
1647         struct mbuf **mnext = &m->m_nextpkt, *m0;
1648         int nfrags = 1;
1649
1650         ip = mtod(m, struct ip *);
1651 #ifdef _IP_VHL
1652         hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1653 #else
1654         hlen = ip->ip_hl << 2;
1655 #endif
1656
1657         firstlen = len = (mtu - hlen) &~ 7;
1658         if (len < 8) {
1659                 m_freem(m);
1660                 return (EMSGSIZE);
1661         }
1662
1663         /*
1664          * if the interface will not calculate checksums on
1665          * fragmented packets, then do it here.
1666          */
1667         if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
1668             (ifp->if_hwassist & CSUM_IP_FRAGS) == 0) {
1669                 in_delayed_cksum(m);
1670                 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1671         }
1672
1673         /*
1674          * Loop through length of segment after first fragment,
1675          * make new header and copy data of each part and link onto chain.
1676          */
1677         m0 = m;
1678         mhlen = sizeof (struct ip);
1679         for (off = hlen + len; off < (u_short)ip->ip_len; off += len) {
1680                 MGETHDR(m, M_DONTWAIT, MT_HEADER);      /* MAC-OK */
1681                 if (m == 0) {
1682                         error = ENOBUFS;
1683                         OSAddAtomic(1, &ipstat.ips_odropped);
1684                         goto sendorfree;
1685                 }
1686                 m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
1687                 m->m_data += max_linkhdr;
1688                 mhip = mtod(m, struct ip *);
1689                 *mhip = *ip;
1690                 if (hlen > sizeof (struct ip)) {
1691                         mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
1692                         mhip->ip_vhl = IP_MAKE_VHL(IPVERSION, mhlen >> 2);
1693                 }
1694                 m->m_len = mhlen;
1695                 mhip->ip_off = ((off - hlen) >> 3) + (ip->ip_off & ~IP_MF);
1696                 if (ip->ip_off & IP_MF)
1697                         mhip->ip_off |= IP_MF;
1698                 if (off + len >= (u_short)ip->ip_len)
1699                         len = (u_short)ip->ip_len - off;
1700                 else
1701                         mhip->ip_off |= IP_MF;
1702                 mhip->ip_len = htons((u_short)(len + mhlen));
1703                 m->m_next = m_copy(m0, off, len);
1704                 if (m->m_next == 0) {
1705                         (void) m_free(m);
1706                         error = ENOBUFS;        /* ??? */
1707                         OSAddAtomic(1, &ipstat.ips_odropped);
1708                         goto sendorfree;
1709                 }
1710                 m->m_pkthdr.len = mhlen + len;
1711                 m->m_pkthdr.rcvif = 0;
1712                 m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
1713                 m->m_pkthdr.socket_id = m0->m_pkthdr.socket_id;
1714 #if CONFIG_MACF_NET
1715                 mac_netinet_fragment(m0, m);
1716 #endif
1717
1718 #if BYTE_ORDER != BIG_ENDIAN
1719                 HTONS(mhip->ip_off);
1720 #endif
1721
1722                 mhip->ip_sum = 0;
1723                 if (sw_csum & CSUM_DELAY_IP) {
1724                         mhip->ip_sum = in_cksum(m, mhlen);
1725                 }
1726                 *mnext = m;
1727                 mnext = &m->m_nextpkt;
1728                 nfrags++;
1729         }
1730         OSAddAtomic(nfrags, &ipstat.ips_ofragments);
1731
1732         /* set first/last markers for fragment chain */
1733         m->m_flags |= M_LASTFRAG;
1734         m0->m_flags |= M_FIRSTFRAG | M_FRAG;
1735         m0->m_pkthdr.csum_data = nfrags;
1736
1737         /*
1738          * Update first fragment by trimming what's been copied out
1739          * and updating header, then send each fragment (in order).
1740          */
1741         m = m0;
1742         m_adj(m, hlen + firstlen - (u_short)ip->ip_len);
1743         m->m_pkthdr.len = hlen + firstlen;
1744         ip->ip_len = htons((u_short)m->m_pkthdr.len);
1745         ip->ip_off |= IP_MF;
1746
1747 #if BYTE_ORDER != BIG_ENDIAN
1748         HTONS(ip->ip_off);
1749 #endif
1750
1751         ip->ip_sum = 0;
1752         if (sw_csum & CSUM_DELAY_IP) {
1753                 ip->ip_sum = in_cksum(m, hlen);
1754         }
1755 sendorfree:
1756         if (error)
1757                 m_freem_list(m0);
1758
1759         return (error);
1760 }
1761
1762 static void
1763 ip_out_cksum_stats(int proto, u_int32_t len)
1764 {
1765         switch (proto) {
1766         case IPPROTO_TCP:
1767                 tcp_out_cksum_stats(len);
1768                 break;
1769         case IPPROTO_UDP:
1770                 udp_out_cksum_stats(len);
1771                 break;
1772         default:
1773                 /* keep only TCP or UDP stats for now */
1774                 break;
1775         }
1776 }
1777
1778 void
1779 in_delayed_cksum_offset(struct mbuf *m0, int ip_offset)
1780 {
1781         struct ip *ip;
1782         unsigned char buf[sizeof(struct ip)];
1783         u_short csum, offset, ip_len;
1784         struct mbuf *m = m0;
1785
1786         while (ip_offset >= m->m_len) {
1787                 ip_offset -= m->m_len;
1788                 m = m->m_next;
1789                 if (m == NULL) {
1790                         printf("in_delayed_cksum_withoffset failed - ip_offset wasn't in the packet\n");
1791                         return;
1792                 }
1793         }
1794
1795         /* Sometimes the IP header is not contiguous, yes this can happen! */
1796         if (ip_offset + sizeof(struct ip) > m->m_len) {
1797 #if DEBUG
1798                 printf("delayed m_pullup, m->len: %d  off: %d\n",
1799                         m->m_len, ip_offset);
1800 #endif
1801                 m_copydata(m, ip_offset, sizeof(struct ip), (caddr_t) buf);
1802
1803                 ip = (struct ip *)buf;
1804         } else {
1805                 ip = (struct ip*)(m->m_data + ip_offset);
1806         }
1807
1808         /* Gross */
1809         if (ip_offset) {
1810                 m->m_len -= ip_offset;
1811                 m->m_data += ip_offset;
1812         }
1813
1814         offset = IP_VHL_HL(ip->ip_vhl) << 2 ;
1815
1816         /*
1817          * We could be in the context of an IP or interface filter; in the
1818          * former case, ip_len would be in host (correct) order while for
1819          * the latter it would be in network order.  Because of this, we
1820          * attempt to interpret the length field by comparing it against
1821          * the actual packet length.  If the comparison fails, byte swap
1822          * the length and check again.  If it still fails, then the packet
1823          * is bogus and we give up.
1824          */
1825         ip_len = ip->ip_len;
1826         if (ip_len != (m0->m_pkthdr.len - ip_offset)) {
1827                 ip_len = SWAP16(ip_len);
1828                 if (ip_len != (m0->m_pkthdr.len - ip_offset)) {
1829                         printf("in_delayed_cksum_offset: ip_len %d (%d) "
1830                             "doesn't match actual length %d\n", ip->ip_len,
1831                             ip_len, (m0->m_pkthdr.len - ip_offset));
1832                         return;
1833                 }
1834         }
1835
1836         csum = in_cksum_skip(m, ip_len, offset);
1837
1838         /* Update stats */
1839         ip_out_cksum_stats(ip->ip_p, ip_len - offset);
1840
1841         if (m0->m_pkthdr.csum_flags & CSUM_UDP && csum == 0)
1842                 csum = 0xffff;
1843         offset += m0->m_pkthdr.csum_data & 0xFFFF;        /* checksum offset */
1844
1845         /* Gross */
1846         if (ip_offset) {
1847                 if (M_LEADINGSPACE(m) < ip_offset)
1848                         panic("in_delayed_cksum_offset - chain modified!\n");
1849                 m->m_len += ip_offset;
1850                 m->m_data -= ip_offset;
1851         }
1852
1853         if (offset > ip_len) /* bogus offset */
1854                 return;
1855
1856         /* Insert the checksum in the existing chain */
1857         if (offset + ip_offset + sizeof(u_short) > m->m_len) {
1858                 char tmp[2];
1859
1860 #if DEBUG
1861                 printf("delayed m_copyback, m->len: %d  off: %d  p: %d\n",
1862                     m->m_len, offset + ip_offset, ip->ip_p);
1863 #endif
1864                 *(u_short *)tmp = csum;
1865                 m_copyback(m, offset + ip_offset, 2, tmp);
1866         } else
1867                 *(u_short *)(m->m_data + offset + ip_offset) = csum;
1868 }
1869
/*
 * Finish a delayed transport-layer checksum for a packet whose IP header
 * sits at the very beginning of the mbuf chain `m'.
 */
void
in_delayed_cksum(struct mbuf *m)
{
	const int ip_hdr_offset = 0;	/* header starts the chain */

	in_delayed_cksum_offset(m, ip_hdr_offset);
}
1875
1876 void
1877 in_cksum_offset(struct mbuf* m, size_t ip_offset)
1878 {
1879         struct ip* ip = NULL;
1880         int hlen = 0;
1881         unsigned char buf[sizeof(struct ip)];
1882         int swapped = 0;
1883
1884         while (ip_offset >= m->m_len) {
1885                 ip_offset -= m->m_len;
1886                 m = m->m_next;
1887                 if (m == NULL) {
1888                         printf("in_cksum_offset failed - ip_offset wasn't in the packet\n");
1889                         return;
1890                 }
1891         }
1892
1893         /* Sometimes the IP header is not contiguous, yes this can happen! */
1894         if (ip_offset + sizeof(struct ip) > m->m_len) {
1895
1896 #if DEBUG
1897                 printf("in_cksum_offset - delayed m_pullup, m->len: %d  off: %lu\n",
1898                         m->m_len, ip_offset);
1899 #endif
1900                 m_copydata(m, ip_offset, sizeof(struct ip), (caddr_t) buf);
1901
1902                 ip = (struct ip *)buf;
1903                 ip->ip_sum = 0;
1904                 m_copyback(m, ip_offset + offsetof(struct ip, ip_sum), 2, (caddr_t)&ip->ip_sum);
1905         } else {
1906                 ip = (struct ip*)(m->m_data + ip_offset);
1907                 ip->ip_sum = 0;
1908         }
1909
1910         /* Gross */
1911         if (ip_offset) {
1912                 m->m_len -= ip_offset;
1913                 m->m_data += ip_offset;
1914         }
1915
1916 #ifdef _IP_VHL
1917         hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1918 #else
1919         hlen = ip->ip_hl << 2;
1920 #endif
1921         /*
1922          * We could be in the context of an IP or interface filter; in the
1923          * former case, ip_len would be in host order while for the latter
1924          * it would be in network (correct) order.  Because of this, we
1925          * attempt to interpret the length field by comparing it against
1926          * the actual packet length.  If the comparison fails, byte swap
1927          * the length and check again.  If it still fails, then the packet
1928          * is bogus and we give up.
1929          */
1930         if (ntohs(ip->ip_len) != (m->m_pkthdr.len - ip_offset)) {
1931                 ip->ip_len = SWAP16(ip->ip_len);
1932                 swapped = 1;
1933                 if (ntohs(ip->ip_len) != (m->m_pkthdr.len - ip_offset)) {
1934                         ip->ip_len = SWAP16(ip->ip_len);
1935                         printf("in_cksum_offset: ip_len %d (%d) "
1936                             "doesn't match actual length %lu\n",
1937                             ip->ip_len, SWAP16(ip->ip_len),
1938                             (m->m_pkthdr.len - ip_offset));
1939                         return;
1940                 }
1941         }
1942
1943         ip->ip_sum = 0;
1944         ip->ip_sum = in_cksum(m, hlen);
1945         if (swapped)
1946                 ip->ip_len = SWAP16(ip->ip_len);
1947
1948         /* Gross */
1949         if (ip_offset) {
1950                 if (M_LEADINGSPACE(m) < ip_offset)
1951                         panic("in_cksum_offset - chain modified!\n");
1952                 m->m_len += ip_offset;
1953                 m->m_data -= ip_offset;
1954         }
1955
1956         /* Insert the checksum in the existing chain if IP header not contiguous */
1957         if (ip_offset + sizeof(struct ip) > m->m_len) {
1958                 char tmp[2];
1959
1960 #if DEBUG
1961                 printf("in_cksum_offset m_copyback, m->len: %u  off: %lu  p: %d\n",
1962                     m->m_len, ip_offset + offsetof(struct ip, ip_sum), ip->ip_p);
1963 #endif
1964                 *(u_short *)tmp = ip->ip_sum;
1965                 m_copyback(m, ip_offset + offsetof(struct ip, ip_sum), 2, tmp);
1966         }
1967 }
1968
1969 /*
1970  * Insert IP options into preformed packet.
1971  * Adjust IP destination as required for IP source routing,
1972  * as indicated by a non-zero in_addr at the start of the options.
1973  *
1974  * XXX This routine assumes that the packet has no options in place.
1975  */
1976 static struct mbuf *
1977 ip_insertoptions(m, opt, phlen)
1978         register struct mbuf *m;
1979         struct mbuf *opt;
1980         int *phlen;
1981 {
1982         register struct ipoption *p = mtod(opt, struct ipoption *);
1983         struct mbuf *n;
1984         register struct ip *ip = mtod(m, struct ip *);
1985         unsigned optlen;
1986
1987         optlen = opt->m_len - sizeof(p->ipopt_dst);
1988         if (optlen + (u_short)ip->ip_len > IP_MAXPACKET)
1989                 return (m);             /* XXX should fail */
1990         if (p->ipopt_dst.s_addr)
1991                 ip->ip_dst = p->ipopt_dst;
1992         if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) {
1993                 MGETHDR(n, M_DONTWAIT, MT_HEADER);      /* MAC-OK */
1994                 if (n == 0)
1995                         return (m);
1996                 n->m_pkthdr.rcvif = 0;
1997 #if CONFIG_MACF_NET
1998                 mac_mbuf_label_copy(m, n);
1999 #endif
2000                 n->m_pkthdr.len = m->m_pkthdr.len + optlen;
2001                 m->m_len -= sizeof(struct ip);
2002                 m->m_data += sizeof(struct ip);
2003                 n->m_next = m;
2004                 m = n;
2005                 m->m_len = optlen + sizeof(struct ip);
2006                 m->m_data += max_linkhdr;
2007                 (void)memcpy(mtod(m, void *), ip, sizeof(struct ip));
2008         } else {
2009                 m->m_data -= optlen;
2010                 m->m_len += optlen;
2011                 m->m_pkthdr.len += optlen;
2012                 ovbcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
2013         }
2014         ip = mtod(m, struct ip *);
2015         bcopy(p->ipopt_list, ip + 1, optlen);
2016         *phlen = sizeof(struct ip) + optlen;
2017         ip->ip_vhl = IP_MAKE_VHL(IPVERSION, *phlen >> 2);
2018         ip->ip_len += optlen;
2019         return (m);
2020 }
2021
2022 /*
2023  * Copy options from ip to jp,
2024  * omitting those not copied during fragmentation.
2025  */
2026 int
2027 ip_optcopy(ip, jp)
2028         struct ip *ip, *jp;
2029 {
2030         register u_char *cp, *dp;
2031         int opt, optlen, cnt;
2032
2033         cp = (u_char *)(ip + 1);
2034         dp = (u_char *)(jp + 1);
2035         cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip);
2036         for (; cnt > 0; cnt -= optlen, cp += optlen) {
2037                 opt = cp[0];
2038                 if (opt == IPOPT_EOL)
2039                         break;
2040                 if (opt == IPOPT_NOP) {
2041                         /* Preserve for IP mcast tunnel's LSRR alignment. */
2042                         *dp++ = IPOPT_NOP;
2043                         optlen = 1;
2044                         continue;
2045                 }
2046 #if DIAGNOSTIC
2047                 if (cnt < IPOPT_OLEN + sizeof(*cp))
2048                         panic("malformed IPv4 option passed to ip_optcopy");
2049 #endif
2050                 optlen = cp[IPOPT_OLEN];
2051 #if DIAGNOSTIC
2052                 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
2053                         panic("malformed IPv4 option passed to ip_optcopy");
2054 #endif
2055                 /* bogus lengths should have been caught by ip_dooptions */
2056                 if (optlen > cnt)
2057                         optlen = cnt;
2058                 if (IPOPT_COPIED(opt)) {
2059                         bcopy(cp, dp, optlen);
2060                         dp += optlen;
2061                 }
2062         }
2063         for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++)
2064                 *dp++ = IPOPT_EOL;
2065         return (optlen);
2066 }
2067
2068 /*
2069  * IP socket option processing.
2070  */
2071 int
2072 ip_ctloutput(so, sopt)
2073         struct socket *so;
2074         struct sockopt *sopt;
2075 {
2076         struct  inpcb *inp = sotoinpcb(so);
2077         int     error, optval;
2078
2079         error = optval = 0;
2080         if (sopt->sopt_level != IPPROTO_IP) {
2081                 return (EINVAL);
2082         }
2083
2084         switch (sopt->sopt_dir) {
2085         case SOPT_SET:
2086                 switch (sopt->sopt_name) {
2087                 case IP_OPTIONS:
2088 #ifdef notyet
2089                 case IP_RETOPTS:
2090 #endif
2091                 {
2092                         struct mbuf *m;
2093                         if (sopt->sopt_valsize > MLEN) {
2094                                 error = EMSGSIZE;
2095                                 break;
2096                         }
2097                         MGET(m, sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT,
2098                             MT_HEADER);
2099                         if (m == 0) {
2100                                 error = ENOBUFS;
2101                                 break;
2102                         }
2103                         m->m_len = sopt->sopt_valsize;
2104                         error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
2105                                             m->m_len);
2106                         if (error)
2107                                 break;
2108
2109                         return (ip_pcbopts(sopt->sopt_name, &inp->inp_options,
2110                                            m));
2111                 }
2112
2113                 case IP_TOS:
2114                 case IP_TTL:
2115                 case IP_RECVOPTS:
2116                 case IP_RECVRETOPTS:
2117                 case IP_RECVDSTADDR:
2118                 case IP_RECVIF:
2119                 case IP_RECVTTL:
2120 #if defined(NFAITH) && NFAITH > 0
2121                 case IP_FAITH:
2122 #endif
2123                         error = sooptcopyin(sopt, &optval, sizeof optval,
2124                                             sizeof optval);
2125                         if (error)
2126                                 break;
2127
2128                         switch (sopt->sopt_name) {
2129                         case IP_TOS:
2130                                 inp->inp_ip_tos = optval;
2131                                 break;
2132
2133                         case IP_TTL:
2134                                 inp->inp_ip_ttl = optval;
2135                                 break;
2136 #define OPTSET(bit) \
2137         if (optval) \
2138                 inp->inp_flags |= bit; \
2139         else \
2140                 inp->inp_flags &= ~bit;
2141
2142                         case IP_RECVOPTS:
2143                                 OPTSET(INP_RECVOPTS);
2144                                 break;
2145
2146                         case IP_RECVRETOPTS:
2147                                 OPTSET(INP_RECVRETOPTS);
2148                                 break;
2149
2150                         case IP_RECVDSTADDR:
2151                                 OPTSET(INP_RECVDSTADDR);
2152                                 break;
2153
2154                         case IP_RECVIF:
2155                                 OPTSET(INP_RECVIF);
2156                                 break;
2157
2158                         case IP_RECVTTL:
2159                                 OPTSET(INP_RECVTTL);
2160                                 break;
2161
2162 #if defined(NFAITH) && NFAITH > 0
2163                         case IP_FAITH:
2164                                 OPTSET(INP_FAITH);
2165                                 break;
2166 #endif
2167                         }
2168                         break;
2169 #undef OPTSET
2170
2171 #if CONFIG_FORCE_OUT_IFP
2172                 /*
2173                  * Apple private interface, similar to IP_BOUND_IF, except
2174                  * that the parameter is a NULL-terminated string containing
2175                  * the name of the network interface; an empty string means
2176                  * unbind.  Applications are encouraged to use IP_BOUND_IF
2177                  * instead, as that is the current "official" API.
2178                  */
2179                 case IP_FORCE_OUT_IFP: {
2180                         char ifname[IFNAMSIZ];
2181                         unsigned int ifscope;
2182
2183                         /* This option is settable only for IPv4 */
2184                         if (!(inp->inp_vflag & INP_IPV4)) {
2185                                 error = EINVAL;
2186                                 break;
2187                         }
2188
2189                         /* Verify interface name parameter is sane */
2190                         if (sopt->sopt_valsize > sizeof(ifname)) {
2191                                 error = EINVAL;
2192                                 break;
2193                         }
2194
2195                         /* Copy the interface name */
2196                         if (sopt->sopt_valsize != 0) {
2197                                 error = sooptcopyin(sopt, ifname,
2198                                     sizeof (ifname), sopt->sopt_valsize);
2199                                 if (error)
2200                                         break;
2201                         }
2202
2203                         if (sopt->sopt_valsize == 0 || ifname[0] == NULL) {
2204                                 /* Unbind this socket from any interface */
2205                                 ifscope = IFSCOPE_NONE;
2206                         } else {
2207                                 ifnet_t ifp;
2208
2209                                 /* Verify name is NULL terminated */
2210                                 if (ifname[sopt->sopt_valsize - 1] != NULL) {
2211                                         error = EINVAL;
2212                                         break;
2213                                 }
2214
2215                                 /* Bail out if given bogus interface name */
2216                                 if (ifnet_find_by_name(ifname, &ifp) != 0) {
2217                                         error = ENXIO;
2218                                         break;
2219                                 }
2220
2221                                 /* Bind this socket to this interface */
2222                                 ifscope = ifp->if_index;
2223
2224                                 /*
2225                                  * Won't actually free; since we don't release
2226                                  * this later, we should do it now.
2227                                  */
2228                                 ifnet_release(ifp);
2229                         }
2230                         ip_bindif(inp, ifscope);
2231                 }
2232                 break;
2233 #endif
2234                 case IP_MULTICAST_IF:
2235                 case IP_MULTICAST_VIF:
2236                 case IP_MULTICAST_TTL:
2237                 case IP_MULTICAST_LOOP:
2238                 case IP_ADD_MEMBERSHIP:
2239                 case IP_DROP_MEMBERSHIP:
2240                         error = ip_setmoptions(sopt, &inp->inp_moptions);
2241                         break;
2242
2243                 case IP_PORTRANGE:
2244                         error = sooptcopyin(sopt, &optval, sizeof optval,
2245                                             sizeof optval);
2246                         if (error)
2247                                 break;
2248
2249                         switch (optval) {
2250                         case IP_PORTRANGE_DEFAULT:
2251                                 inp->inp_flags &= ~(INP_LOWPORT);
2252                                 inp->inp_flags &= ~(INP_HIGHPORT);
2253                                 break;
2254
2255                         case IP_PORTRANGE_HIGH:
2256                                 inp->inp_flags &= ~(INP_LOWPORT);
2257                                 inp->inp_flags |= INP_HIGHPORT;
2258                                 break;
2259
2260                         case IP_PORTRANGE_LOW:
2261                                 inp->inp_flags &= ~(INP_HIGHPORT);
2262                                 inp->inp_flags |= INP_LOWPORT;
2263                                 break;
2264
2265                         default:
2266                                 error = EINVAL;
2267                                 break;
2268                         }
2269                         break;
2270
2271 #if IPSEC
2272                 case IP_IPSEC_POLICY:
2273                 {
2274                         caddr_t req = NULL;
2275                         size_t len = 0;
2276                         int priv;
2277                         struct mbuf *m;
2278                         int optname;
2279
2280                         if (sopt->sopt_valsize > MCLBYTES) {
2281                                 error = EMSGSIZE;
2282                                 break;
2283                         }
2284                         if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
2285                                 break;
2286                         if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
2287                                 break;
2288                         priv = (proc_suser(sopt->sopt_p) == 0);
2289                         if (m) {
2290                                 req = mtod(m, caddr_t);
2291                                 len = m->m_len;
2292                         }
2293                         optname = sopt->sopt_name;
2294                         error = ipsec4_set_policy(inp, optname, req, len, priv);
2295                         m_freem(m);
2296                         break;
2297                 }
2298 #endif /*IPSEC*/
2299
2300 #if TRAFFIC_MGT
2301                 case IP_TRAFFIC_MGT_BACKGROUND:
2302                 {
2303                         unsigned        background = 0;
2304                         error = sooptcopyin(sopt, &background, sizeof(background), sizeof(background));
2305                         if (error)
2306                                 break;
2307
2308                         if (background) {
2309                                 socket_set_traffic_mgt_flags(so,
2310                                     TRAFFIC_MGT_SO_BACKGROUND |
2311                                     TRAFFIC_MGT_SO_BG_REGULATE);
2312                         } else {
2313                                 socket_clear_traffic_mgt_flags(so,
2314                                     TRAFFIC_MGT_SO_BACKGROUND |
2315                                     TRAFFIC_MGT_SO_BG_REGULATE);
2316                         }
2317
2318                         break;
2319                 }
2320 #endif /* TRAFFIC_MGT */
2321
2322                 /*
2323                  * On a multihomed system, scoped routing can be used to
2324                  * restrict the source interface used for sending packets.
2325                  * The socket option IP_BOUND_IF binds a particular AF_INET
2326                  * socket to an interface such that data sent on the socket
2327                  * is restricted to that interface.  This is unlike the
2328                  * SO_DONTROUTE option where the routing table is bypassed;
2329                  * therefore it allows for a greater flexibility and control
2330                  * over the system behavior, and does not place any restriction
2331                  * on the destination address type (e.g.  unicast, multicast,
2332                  * or broadcast if applicable) or whether or not the host is
2333                  * directly reachable.  Note that in the multicast transmit
2334                  * case, IP_MULTICAST_IF takes precedence over IP_BOUND_IF,
2335                  * since the former practically bypasses the routing table;
2336                  * in this case, IP_BOUND_IF sets the default interface used
2337                  * for sending multicast packets in the absence of an explicit
2338                  * transmit interface set via IP_MULTICAST_IF.
2339                  */
2340                 case IP_BOUND_IF:
2341                         /* This option is settable only for IPv4 */
2342                         if (!(inp->inp_vflag & INP_IPV4)) {
2343                                 error = EINVAL;
2344                                 break;
2345                         }
2346
2347                         error = sooptcopyin(sopt, &optval, sizeof (optval),
2348                             sizeof (optval));
2349
2350                         if (error)
2351                                 break;
2352
2353                         ip_bindif(inp, optval);
2354                         break;
2355
2356                 default:
2357                         error = ENOPROTOOPT;
2358                         break;
2359                 }
2360                 break;
2361
2362         case SOPT_GET:
2363                 switch (sopt->sopt_name) {
2364                 case IP_OPTIONS:
2365                 case IP_RETOPTS:
2366                         if (inp->inp_options)
2367                                 error = sooptcopyout(sopt,
2368                                                      mtod(inp->inp_options,
2369                                                           char *),
2370                                                      inp->inp_options->m_len);
2371                         else
2372                                 sopt->sopt_valsize = 0;
2373                         break;
2374
2375                 case IP_TOS:
2376                 case IP_TTL:
2377                 case IP_RECVOPTS:
2378                 case IP_RECVRETOPTS:
2379                 case IP_RECVDSTADDR:
2380                 case IP_RECVIF:
2381                 case IP_RECVTTL:
2382                 case IP_PORTRANGE:
2383 #if defined(NFAITH) && NFAITH > 0
2384                 case IP_FAITH:
2385 #endif
2386                         switch (sopt->sopt_name) {
2387
2388                         case IP_TOS:
2389                                 optval = inp->inp_ip_tos;
2390                                 break;
2391
2392                         case IP_TTL:
2393                                 optval = inp->inp_ip_ttl;
2394                                 break;
2395
2396 #define OPTBIT(bit)     (inp->inp_flags & bit ? 1 : 0)
2397
2398                         case IP_RECVOPTS:
2399                                 optval = OPTBIT(INP_RECVOPTS);
2400                                 break;
2401
2402                         case IP_RECVRETOPTS:
2403                                 optval = OPTBIT(INP_RECVRETOPTS);
2404                                 break;
2405
2406                         case IP_RECVDSTADDR:
2407                                 optval = OPTBIT(INP_RECVDSTADDR);
2408                                 break;
2409
2410                         case IP_RECVIF:
2411                                 optval = OPTBIT(INP_RECVIF);
2412                                 break;
2413
2414                         case IP_RECVTTL:
2415                                 optval = OPTBIT(INP_RECVTTL);
2416                                 break;
2417
2418                         case IP_PORTRANGE:
2419                                 if (inp->inp_flags & INP_HIGHPORT)
2420                                         optval = IP_PORTRANGE_HIGH;
2421                                 else if (inp->inp_flags & INP_LOWPORT)
2422                                         optval = IP_PORTRANGE_LOW;
2423                                 else
2424                                         optval = 0;
2425                                 break;
2426
2427 #if defined(NFAITH) && NFAITH > 0
2428                         case IP_FAITH:
2429                                 optval = OPTBIT(INP_FAITH);
2430                                 break;
2431 #endif
2432                         }
2433                         error = sooptcopyout(sopt, &optval, sizeof optval);
2434                         break;
2435
2436                 case IP_MULTICAST_IF:
2437                 case IP_MULTICAST_VIF:
2438                 case IP_MULTICAST_TTL:
2439                 case IP_MULTICAST_LOOP:
2440                 case IP_ADD_MEMBERSHIP:
2441                 case IP_DROP_MEMBERSHIP:
2442                         error = ip_getmoptions(sopt, inp->inp_moptions);
2443                         break;
2444
2445 #if IPSEC
2446                 case IP_IPSEC_POLICY:
2447                 {
2448                         struct mbuf *m = NULL;
2449                         caddr_t req = NULL;
2450                         size_t len = 0;
2451
2452                         if (m != 0) {
2453                                 req = mtod(m, caddr_t);
2454                                 len = m->m_len;
2455                         }
2456                         error = ipsec4_get_policy(sotoinpcb(so), req, len, &m);
2457                         if (error == 0)
2458                                 error = soopt_mcopyout(sopt, m); /* XXX */
2459                         if (error == 0)
2460                                 m_freem(m);
2461                         break;
2462                 }
2463 #endif /*IPSEC*/
2464
2465 #if TRAFFIC_MGT
2466                 case IP_TRAFFIC_MGT_BACKGROUND:
2467                 {
2468                         unsigned        background = so->so_traffic_mgt_flags;
2469                         return (sooptcopyout(sopt, &background, sizeof(background)));
2470                         break;
2471                 }
2472 #endif /* TRAFFIC_MGT */
2473
2474                 case IP_BOUND_IF:
2475                         if (inp->inp_flags & INP_BOUND_IF)
2476                                 optval = inp->inp_boundif;
2477                         error = sooptcopyout(sopt, &optval, sizeof (optval));
2478                         break;
2479
2480                 default:
2481                         error = ENOPROTOOPT;
2482                         break;
2483                 }
2484                 break;
2485         }
2486         return (error);
2487 }
2488
2489 /*
2490  * Set up IP options in pcb for insertion in output packets.
2491  * Store in mbuf with pointer in pcbopt, adding pseudo-option
2492  * with destination address if source routed.
2493  */
2494 static int
2495 ip_pcbopts(
2496         __unused int optname,
2497         struct mbuf **pcbopt,
2498         register struct mbuf *m)
2499 {
2500         register int cnt, optlen;
2501         register u_char *cp;
2502         u_char opt;
2503
2504         /* turn off any old options */
2505         if (*pcbopt)
2506                 (void)m_free(*pcbopt);
2507         *pcbopt = 0;
2508         if (m == (struct mbuf *)0 || m->m_len == 0) {
2509                 /*
2510                  * Only turning off any previous options.
2511                  */
2512                 if (m)
2513                         (void)m_free(m);
2514                 return (0);
2515         }
2516
2517 #ifndef vax
2518         if (m->m_len % sizeof(int32_t))
2519                 goto bad;
2520 #endif
2521         /*
2522          * IP first-hop destination address will be stored before
2523          * actual options; move other options back
2524          * and clear it when none present.
2525          */
2526         if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN])
2527                 goto bad;
2528         cnt = m->m_len;
2529         m->m_len += sizeof(struct in_addr);
2530         cp = mtod(m, u_char *) + sizeof(struct in_addr);
2531         ovbcopy(mtod(m, caddr_t), (caddr_t)cp, (unsigned)cnt);
2532         bzero(mtod(m, caddr_t), sizeof(struct in_addr));
2533
2534         for (; cnt > 0; cnt -= optlen, cp += optlen) {
2535                 opt = cp[IPOPT_OPTVAL];
2536                 if (opt == IPOPT_EOL)
2537                         break;
2538                 if (opt == IPOPT_NOP)
2539                         optlen = 1;
2540                 else {
2541                         if (cnt < IPOPT_OLEN + sizeof(*cp))
2542                                 goto bad;
2543                         optlen = cp[IPOPT_OLEN];
2544                         if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
2545                                 goto bad;
2546                 }
2547                 switch (opt) {
2548
2549                 default:
2550                         break;
2551
2552                 case IPOPT_LSRR:
2553                 case IPOPT_SSRR:
2554                         /*
2555                          * user process specifies route as:
2556                          *      ->A->B->C->D
2557                          * D must be our final destination (but we can't
2558                          * check that since we may not have connected yet).
2559                          * A is first hop destination, which doesn't appear in
2560                          * actual IP option, but is stored before the options.
2561                          */
2562                         if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr))
2563                                 goto bad;
2564                         m->m_len -= sizeof(struct in_addr);
2565                         cnt -= sizeof(struct in_addr);
2566                         optlen -= sizeof(struct in_addr);
2567                         cp[IPOPT_OLEN] = optlen;
2568                         /*
2569                          * Move first hop before start of options.
2570                          */
2571                         bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t),
2572                             sizeof(struct in_addr));
2573                         /*
2574                          * Then copy rest of options back
2575                          * to close up the deleted entry.
2576                          */
2577                         ovbcopy((caddr_t)(&cp[IPOPT_OFFSET+1] +
2578                             sizeof(struct in_addr)),
2579                             (caddr_t)&cp[IPOPT_OFFSET+1],
2580                             (unsigned)cnt + sizeof(struct in_addr));
2581                         break;
2582                 }
2583         }
2584         if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr))
2585                 goto bad;
2586         *pcbopt = m;
2587         return (0);
2588
2589 bad:
2590         (void)m_free(m);
2591         return (EINVAL);
2592 }
2593
2594 /*
2595  * XXX
2596  * The whole multicast option thing needs to be re-thought.
2597  * Several of these options are equally applicable to non-multicast
2598  * transmission, and one (IP_MULTICAST_TTL) totally duplicates a
2599  * standard option (IP_TTL).
2600  */
2601
2602 /*
2603  * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index.
2604  */
2605 static struct ifnet *
2606 ip_multicast_if(a, ifindexp)
2607         struct in_addr *a;
2608         int *ifindexp;
2609 {
2610         int ifindex;
2611         struct ifnet *ifp;
2612
2613         if (ifindexp)
2614                 *ifindexp = 0;
2615         if (ntohl(a->s_addr) >> 24 == 0) {
2616                 ifindex = ntohl(a->s_addr) & 0xffffff;
2617                 ifnet_head_lock_shared();
2618                 if (ifindex < 0 || if_index < ifindex) {
2619                         ifnet_head_done();
2620                         return NULL;
2621                 }
2622                 ifp = ifindex2ifnet[ifindex];
2623                 ifnet_head_done();
2624                 if (ifindexp)
2625                         *ifindexp = ifindex;
2626         } else {
2627                 INADDR_TO_IFP(*a, ifp);
2628         }
2629         return ifp;
2630 }
2631
2632 /*
2633  * Set the IP multicast options in response to user setsockopt().
2634  */
2635 static int
2636 ip_setmoptions(sopt, imop)
2637         struct sockopt *sopt;
2638         struct ip_moptions **imop;
2639 {
2640         int error = 0;
2641         struct in_addr addr;
2642         struct ip_mreq mreq;
2643         struct ifnet *ifp = NULL;
2644         struct ip_moptions *imo = *imop;
2645         int ifindex;
2646
2647         if (imo == NULL) {
2648                 /*
2649                  * No multicast option buffer attached to the pcb;
2650                  * allocate one and initialize to default values.
2651                  */
2652                 error = ip_createmoptions(imop);
2653                 if (error != 0)
2654                         return error;
2655                 imo = *imop;
2656         }
2657
2658         switch (sopt->sopt_name) {
2659         /* store an index number for the vif you wanna use in the send */
2660 #if MROUTING
2661         case IP_MULTICAST_VIF:
2662                 {
2663                         int i;
2664                         if (legal_vif_num == 0) {
2665                                 error = EOPNOTSUPP;
2666                                 break;
2667                         }
2668                         error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
2669                         if (error)
2670                                 break;
2671                         if (!legal_vif_num(i) && (i != -1)) {
2672                                 error = EINVAL;
2673                                 break;
2674                         }
2675                         imo->imo_multicast_vif = i;
2676                         break;
2677                 }
2678 #endif /* MROUTING */
2679
2680         case IP_MULTICAST_IF:
2681                 /*
2682                  * Select the interface for outgoing multicast packets.
2683                  */
2684                 error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr);
2685                 if (error)
2686                         break;
2687                 /*
2688                  * INADDR_ANY is used to remove a previous selection.
2689                  * When no interface is selected, a default one is
2690                  * chosen every time a multicast packet is sent.
2691                  */
2692                 if (addr.s_addr == INADDR_ANY) {
2693                         imo->imo_multicast_ifp = NULL;
2694                         break;
2695                 }
2696                 /*
2697                  * The selected interface is identified by its local
2698                  * IP address.  Find the interface and confirm that
2699                  * it supports multicasting.
2700                  */
2701                 ifp = ip_multicast_if(&addr, &ifindex);
2702                 if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
2703                         error = EADDRNOTAVAIL;
2704                         break;
2705                 }
2706                 imo->imo_multicast_ifp = ifp;
2707                 if (ifindex)
2708                         imo->imo_multicast_addr = addr;
2709                 else
2710                         imo->imo_multicast_addr.s_addr = INADDR_ANY;
2711                 break;
2712
2713         case IP_MULTICAST_TTL:
2714                 /*
2715                  * Set the IP time-to-live for outgoing multicast packets.
2716                  * The original multicast API required a char argument,
2717                  * which is inconsistent with the rest of the socket API.
2718                  * We allow either a char or an int.
2719                  */
2720                 if (sopt->sopt_valsize == 1) {
2721                         u_char ttl;
2722                         error = sooptcopyin(sopt, &ttl, 1, 1);
2723                         if (error)
2724                                 break;
2725                         imo->imo_multicast_ttl = ttl;
2726                 } else {
2727                         u_int ttl;
2728                         error = sooptcopyin(sopt, &ttl, sizeof ttl,
2729                                             sizeof ttl);
2730                         if (error)
2731                                 break;
2732                         if (ttl > 255)
2733                                 error = EINVAL;
2734                         else
2735                                 imo->imo_multicast_ttl = ttl;
2736                 }
2737                 break;
2738
2739         case IP_MULTICAST_LOOP:
2740                 /*
2741                  * Set the loopback flag for outgoing multicast packets.
2742                  * Must be zero or one.  The original multicast API required a
2743                  * char argument, which is inconsistent with the rest
2744                  * of the socket API.  We allow either a char or an int.
2745                  */
2746                 if (sopt->sopt_valsize == 1) {
2747                         u_char loop;
2748                         error = sooptcopyin(sopt, &loop, 1, 1);
2749                         if (error)
2750                                 break;
2751                         imo->imo_multicast_loop = !!loop;
2752                 } else {
2753                         u_int loop;
2754                         error = sooptcopyin(sopt, &loop, sizeof loop,
2755                                             sizeof loop);
2756                         if (error)
2757                                 break;
2758                         imo->imo_multicast_loop = !!loop;
2759                 }
2760                 break;
2761
2762         case IP_ADD_MEMBERSHIP:
2763                 /*
2764                  * Add a multicast group membership.
2765                  * Group must be a valid IP multicast address.
2766                  */
2767                 error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq);
2768                 if (error)
2769                         break;
2770
2771                 error = ip_addmembership(imo, &mreq);
2772                 break;
2773
2774         case IP_DROP_MEMBERSHIP:
2775                 /*
2776                  * Drop a multicast group membership.
2777                  * Group must be a valid IP multicast address.
2778                  */
2779                 error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq);
2780                 if (error)
2781                         break;
2782
2783                 error = ip_dropmembership(imo, &mreq);
2784                 break;
2785
2786         default:
2787                 error = EOPNOTSUPP;
2788                 break;
2789         }
2790
2791         /*
2792          * If all options have default values, no need to keep the mbuf.
2793          */
2794         if (imo->imo_multicast_ifp == NULL &&
2795             imo->imo_multicast_vif == (u_int32_t)-1 &&
2796             imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL &&
2797             imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP &&
2798             imo->imo_num_memberships == 0) {
2799                 FREE(*imop, M_IPMOPTS);
2800                 *imop = NULL;
2801         }
2802
2803         return (error);
2804 }
2805
2806 /*
2807  * Set the IP multicast options in response to user setsockopt().
2808  */
2809 __private_extern__ int
2810 ip_createmoptions(
2811         struct ip_moptions **imop)
2812 {
2813         struct ip_moptions *imo;
2814         imo = (struct ip_moptions*) _MALLOC(sizeof(*imo), M_IPMOPTS,
2815                 M_WAITOK);
2816
2817         if (imo == NULL)
2818                 return (ENOBUFS);
2819         *imop = imo;
2820         imo->imo_multicast_ifp = NULL;
2821         imo->imo_multicast_addr.s_addr = INADDR_ANY;
2822         imo->imo_multicast_vif = -1;
2823         imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
2824         imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
2825         imo->imo_num_memberships = 0;
2826
2827         return 0;
2828 }
2829
2830 /*
2831  * Add membership to an IPv4 multicast.
2832  */
2833 __private_extern__ int
2834 ip_addmembership(
2835         struct ip_moptions *imo,
2836         struct ip_mreq *mreq)
2837 {
2838         struct route ro;
2839         struct sockaddr_in *dst;
2840         struct ifnet *ifp = NULL;
2841         int error = 0;
2842         int i;
2843
2844         bzero((caddr_t)&ro, sizeof(ro));
2845
2846         if (!IN_MULTICAST(ntohl(mreq->imr_multiaddr.s_addr))) {
2847                 error = EINVAL;
2848                 goto done;
2849         }
2850         /*
2851          * If no interface address was provided, use the interface of
2852          * the route to the given multicast address.
2853          */
2854         if (mreq->imr_interface.s_addr == INADDR_ANY) {
2855                 dst = (struct sockaddr_in *)&ro.ro_dst;
2856                 dst->sin_len = sizeof(*dst);
2857                 dst->sin_family = AF_INET;
2858                 dst->sin_addr = mreq->imr_multiaddr;
2859                 rtalloc_ign(&ro, 0);
2860                 if (ro.ro_rt != NULL) {
2861                         ifp = ro.ro_rt->rt_ifp;
2862                 } else {
2863                         /* If there's no default route, try using loopback */
2864                         mreq->imr_interface.s_addr = htonl(INADDR_LOOPBACK);
2865                 }
2866         }
2867
2868         if (ifp == NULL) {
2869                 ifp = ip_multicast_if(&mreq->imr_interface, NULL);
2870         }
2871
2872         /*
2873          * See if we found an interface, and confirm that it
2874          * supports multicast.
2875          */
2876         if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
2877                 error = EADDRNOTAVAIL;
2878                 goto done;
2879         }
2880         /*
2881          * See if the membership already exists or if all the
2882          * membership slots are full.
2883          */
2884         for (i = 0; i < imo->imo_num_memberships; ++i) {
2885                 if (imo->imo_membership[i]->inm_ifp == ifp &&
2886                         imo->imo_membership[i]->inm_addr.s_addr
2887                                         == mreq->imr_multiaddr.s_addr)
2888                         break;
2889         }
2890         if (i < imo->imo_num_memberships) {
2891                 error = EADDRINUSE;
2892                 goto done;
2893         }
2894         if (i == IP_MAX_MEMBERSHIPS) {
2895                 error = ETOOMANYREFS;
2896                 goto done;
2897         }
2898         /*
2899          * Everything looks good; add a new record to the multicast
2900          * address list for the given interface.
2901          */
2902         if ((imo->imo_membership[i] =
2903                 in_addmulti(&mreq->imr_multiaddr, ifp)) == NULL) {
2904                 error = ENOBUFS;
2905                 goto done;
2906         }
2907         ++imo->imo_num_memberships;
2908
2909 done:
2910         if (ro.ro_rt != NULL)
2911                 rtfree(ro.ro_rt);
2912
2913         return error;
2914 }
2915
2916 /*
2917  * Drop membership of an IPv4 multicast.
2918  */
2919 __private_extern__ int
2920 ip_dropmembership(
2921         struct ip_moptions *imo,
2922         struct ip_mreq *mreq)
2923 {
2924         int error = 0;
2925         struct ifnet* ifp = NULL;
2926         int i;
2927
2928         if (!IN_MULTICAST(ntohl(mreq->imr_multiaddr.s_addr))) {
2929                 error = EINVAL;
2930                 return error;
2931         }
2932
2933         /*
2934          * If an interface address was specified, get a pointer
2935          * to its ifnet structure.
2936          */
2937         if (mreq->imr_interface.s_addr == INADDR_ANY)
2938                 ifp = NULL;
2939         else {
2940                 ifp = ip_multicast_if(&mreq->imr_interface, NULL);
2941                 if (ifp == NULL) {
2942                         error = EADDRNOTAVAIL;
2943                         return error;
2944                 }
2945         }
2946         /*
2947          * Find the membership in the membership array.
2948          */
2949         for (i = 0; i < imo->imo_num_memberships; ++i) {
2950                 if ((ifp == NULL ||
2951                          imo->imo_membership[i]->inm_ifp == ifp) &&
2952                          imo->imo_membership[i]->inm_addr.s_addr ==
2953                          mreq->imr_multiaddr.s_addr)
2954                         break;
2955         }
2956         if (i == imo->imo_num_memberships) {
2957                 error = EADDRNOTAVAIL;
2958                 return error;
2959         }
2960         /*
2961          * Give up the multicast address record to which the
2962          * membership points.
2963          */
2964         in_delmulti(&imo->imo_membership[i]);
2965         /*
2966          * Remove the gap in the membership array.
2967          */
2968         for (++i; i < imo->imo_num_memberships; ++i)
2969                 imo->imo_membership[i-1] = imo->imo_membership[i];
2970         --imo->imo_num_memberships;
2971
2972         return error;
2973 }
2974
2975 /*
2976  * Return the IP multicast options in response to user getsockopt().
2977  */
2978 static int
2979 ip_getmoptions(sopt, imo)
2980         struct sockopt *sopt;
2981         register struct ip_moptions *imo;
2982 {
2983         struct in_addr addr;
2984         struct in_ifaddr *ia;
2985         int error, optval;
2986         u_char coptval;
2987
2988         error = 0;
2989         switch (sopt->sopt_name) {
2990 #if MROUTING
2991         case IP_MULTICAST_VIF:
2992                 if (imo != NULL)
2993                         optval = imo->imo_multicast_vif;
2994                 else
2995                         optval = -1;
2996                 error = sooptcopyout(sopt, &optval, sizeof optval);
2997                 break;
2998 #endif /* MROUTING */
2999
3000         case IP_MULTICAST_IF:
3001                 if (imo == NULL || imo->imo_multicast_ifp == NULL)
3002                         addr.s_addr = INADDR_ANY;
3003                 else if (imo->imo_multicast_addr.s_addr) {
3004                         /* return the value user has set */
3005                         addr = imo->imo_multicast_addr;
3006                 } else {
3007                         IFP_TO_IA(imo->imo_multicast_ifp, ia);
3008                         addr.s_addr = (ia == NULL) ? INADDR_ANY
3009                                 : IA_SIN(ia)->sin_addr.s_addr;
3010                         if (ia != NULL)
3011                                 ifafree(&ia->ia_ifa);
3012                 }
3013                 error = sooptcopyout(sopt, &addr, sizeof addr);
3014                 break;
3015
3016         case IP_MULTICAST_TTL:
3017                 if (imo == 0)
3018                         optval = coptval = IP_DEFAULT_MULTICAST_TTL;
3019                 else
3020                         optval = coptval = imo->imo_multicast_ttl;
3021                 if (sopt->sopt_valsize == 1)
3022                         error = sooptcopyout(sopt, &coptval, 1);
3023                 else
3024                         error = sooptcopyout(sopt, &optval, sizeof optval);
3025                 break;
3026
3027         case IP_MULTICAST_LOOP:
3028                 if (imo == 0)
3029                         optval = coptval = IP_DEFAULT_MULTICAST_LOOP;
3030                 else
3031                         optval = coptval = imo->imo_multicast_loop;
3032                 if (sopt->sopt_valsize == 1)
3033                         error = sooptcopyout(sopt, &coptval, 1);
3034                 else
3035                         error = sooptcopyout(sopt, &optval, sizeof optval);
3036                 break;
3037
3038         default:
3039                 error = ENOPROTOOPT;
3040                 break;
3041         }
3042         return (error);
3043 }
3044
3045 /*
3046  * Discard the IP multicast options.
3047  */
3048 void
3049 ip_freemoptions(imo)
3050         register struct ip_moptions *imo;
3051 {
3052         register int i;
3053
3054         if (imo != NULL) {
3055                 for (i = 0; i < imo->imo_num_memberships; ++i)
3056                         in_delmulti(&imo->imo_membership[i]);
3057                 FREE(imo, M_IPMOPTS);
3058         }
3059 }
3060
3061 /*
3062  * Routine called from ip_output() to loop back a copy of an IP multicast
3063  * packet to the input queue of a specified interface.  Note that this
3064  * calls the output routine of the loopback "driver", but with an interface
3065  * pointer that might NOT be a loopback interface -- evil, but easier than
3066  * replicating that code here.
3067  */
3068 static void
3069 ip_mloopback(ifp, m, dst, hlen)
3070         struct ifnet *ifp;
3071         register struct mbuf *m;
3072         register struct sockaddr_in *dst;
3073         int hlen;
3074 {
3075         register struct ip *ip;
3076         struct mbuf *copym;
3077         int sw_csum = (apple_hwcksum_tx == 0);
3078
3079         copym = m_copy(m, 0, M_COPYALL);
3080         if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen))
3081                 copym = m_pullup(copym, hlen);
3082
3083         if (copym == NULL)
3084                 return;
3085
3086         /*
3087          * We don't bother to fragment if the IP length is greater
3088          * than the interface's MTU.  Can this possibly matter?
3089          */
3090         ip = mtod(copym, struct ip *);
3091
3092 #if BYTE_ORDER != BIG_ENDIAN
3093         HTONS(ip->ip_len);
3094         HTONS(ip->ip_off);
3095 #endif
3096
3097         ip->ip_sum = 0;
3098         ip->ip_sum = in_cksum(copym, hlen);
3099         /*
3100          * NB:
3101          * It's not clear whether there are any lingering
3102          * reentrancy problems in other areas which might
3103          * be exposed by using ip_input directly (in
3104          * particular, everything which modifies the packet
3105          * in-place).  Yet another option is using the
3106          * protosw directly to deliver the looped back
3107          * packet.  For the moment, we'll err on the side
3108          * of safety by using if_simloop().
3109          */
3110 #if 1 /* XXX */
3111         if (dst->sin_family != AF_INET) {
3112                 printf("ip_mloopback: bad address family %d\n",
3113                                         dst->sin_family);
3114                 dst->sin_family = AF_INET;
3115         }
3116 #endif
3117
3118         /*
3119          * Mark checksum as valid or calculate checksum for loopback.
3120          *
3121          * This is done this way because we have to embed the ifp of
3122          * the interface we will send the original copy of the packet
3123          * out on in the mbuf. ip_input will check if_hwassist of the
3124          * embedded ifp and ignore all csum_flags if if_hwassist is 0.
3125          * The UDP checksum has not been calculated yet.
3126          */
3127         if (sw_csum || (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA)) {
3128                 if (!sw_csum && IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist)) {
3129                         copym->m_pkthdr.csum_flags |=
3130                             CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
3131                             CSUM_IP_CHECKED | CSUM_IP_VALID;
3132                         copym->m_pkthdr.csum_data = 0xffff;
3133                 } else {
3134
3135 #if BYTE_ORDER != BIG_ENDIAN
3136                         NTOHS(ip->ip_len);
3137 #endif
3138
3139                         in_delayed_cksum(copym);
3140
3141 #if BYTE_ORDER != BIG_ENDIAN
3142                         HTONS(ip->ip_len);
3143 #endif
3144
3145                 }
3146         }
3147
3148         /*
3149          * TedW:
3150          * We need to send all loopback traffic down to dlil in case
3151          * a filter has tapped-in.
3152          */
3153
3154         /*
3155          * Stuff the 'real' ifp into the pkthdr, to be used in matching
3156          *  in ip_input(); we need the loopback ifp/dl_tag passed as args
3157          *  to make the loopback driver compliant with the data link
3158          *  requirements.
3159          */
3160         if (lo_ifp) {
3161                 copym->m_pkthdr.rcvif = ifp;
3162                 dlil_output(lo_ifp, PF_INET, copym, 0,
3163                     (struct sockaddr *) dst, 0);
3164         } else {
3165                 printf("Warning: ip_output call to dlil_find_dltag failed!\n");
3166                 m_freem(copym);
3167         }
3168 }
3169
3170 /*
3171  * Given a source IP address (and route, if available), determine the best
3172  * interface to send the packet from.  Checking for (and updating) the
3173  * ROF_SRCIF_SELECTED flag in the pcb-supplied route placeholder is done
3174  * without any locks based on the assumption that ip_output() is single-
3175  * threaded per-pcb, i.e. for any given pcb there can only be one thread
3176  * performing output at the IP layer.
3177  */
3178 static struct ifaddr *
3179 in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope)
3180 {
3181         struct ifaddr *ifa = NULL;
3182         struct in_addr src = ip->ip_src;
3183         struct in_addr dst = ip->ip_dst;
3184         struct ifnet *rt_ifp;
3185         char s_src[MAX_IPv4_STR_LEN], s_dst[MAX_IPv4_STR_LEN];
3186
3187         if (ip_select_srcif_debug) {
3188                 (void) inet_ntop(AF_INET, &src.s_addr, s_src, sizeof (s_src));
3189                 (void) inet_ntop(AF_INET, &dst.s_addr, s_dst, sizeof (s_dst));
3190         }
3191
3192         if (ro->ro_rt != NULL)
3193                 RT_LOCK(ro->ro_rt);
3194
3195         rt_ifp = (ro->ro_rt != NULL) ? ro->ro_rt->rt_ifp : NULL;
3196
3197         /*
3198          * Given the source IP address, find a suitable source interface
3199          * to use for transmission; if the caller has specified a scope,
3200          * optimize the search by looking at the addresses only for that
3201          * interface.  This is still suboptimal, however, as we need to
3202          * traverse the per-interface list.
3203          */
3204         if (ifscope != IFSCOPE_NONE || ro->ro_rt != NULL) {
3205                 unsigned int scope = ifscope;
3206
3207                 /*
3208                  * If no scope is specified and the route is stale (pointing
3209                  * to a defunct interface) use the current primary interface;
3210                  * this happens when switching between interfaces configured
3211                  * with the same IP address.  Otherwise pick up the scope
3212                  * information from the route; the ULP may have looked up a
3213                  * correct route and we just need to verify it here and mark
3214                  * it with the ROF_SRCIF_SELECTED flag below.
3215                  */
3216                 if (scope == IFSCOPE_NONE) {
3217                         scope = rt_ifp->if_index;
3218                         if (scope != get_primary_ifscope() &&
3219                             ro->ro_rt->generation_id != route_generation)
3220                                 scope = get_primary_ifscope();
3221                 }
3222
3223                 ifa = (struct ifaddr *)ifa_foraddr_scoped(src.s_addr, scope);
3224
3225                 if (ifa == NULL && ip->ip_p != IPPROTO_UDP &&
3226                     ip->ip_p != IPPROTO_TCP && ipforwarding) {
3227                         /*
3228                          * If forwarding is enabled, and if the packet isn't
3229                          * TCP or UDP, check if the source address belongs
3230                          * to one of our own interfaces; if so, demote the
3231                          * interface scope and do a route lookup right below.
3232                          */
3233                         ifa = (struct ifaddr *)ifa_foraddr(src.s_addr);
3234                         if (ifa != NULL) {
3235                                 ifafree(ifa);
3236                                 ifa = NULL;
3237                                 ifscope = IFSCOPE_NONE;
3238                         }
3239                 }
3240
3241                 if (ip_select_srcif_debug && ifa != NULL) {
3242                         if (ro->ro_rt != NULL) {
3243                                 printf("%s->%s ifscope %d->%d ifa_if %s%d "
3244                                     "ro_if %s%d\n", s_src, s_dst, ifscope,
3245                                     scope, ifa->ifa_ifp->if_name,
3246                                     ifa->ifa_ifp->if_unit, rt_ifp->if_name,
3247                                     rt_ifp->if_unit);
3248                         } else {
3249                                 printf("%s->%s ifscope %d->%d ifa_if %s%d\n",
3250                                     s_src, s_dst, ifscope, scope,
3251                                     ifa->ifa_ifp->if_name,
3252                                     ifa->ifa_ifp->if_unit);
3253                         }
3254                 }
3255         }
3256
3257         /*
3258          * Slow path; search for an interface having the corresponding source
3259          * IP address if the scope was not specified by the caller, and:
3260          *
3261          *   1) There currently isn't any route, or,
3262          *   2) The interface used by the route does not own that source
3263          *      IP address; in this case, the route will get blown away
3264          *      and we'll do a more specific scoped search using the newly
3265          *      found interface.
3266          */
3267         if (ifa == NULL && ifscope == IFSCOPE_NONE) {
3268                 ifa = (struct ifaddr *)ifa_foraddr(src.s_addr);
3269
3270                 /*
3271                  * If we have the IP address, but not the route, we don't
3272                  * really know whether or not it belongs to the correct
3273                  * interface (it could be shared across multiple interfaces.)
3274                  * The only way to find out is to do a route lookup.
3275                  */
3276                 if (ifa != NULL && ro->ro_rt == NULL) {
3277                         struct rtentry *rt;
3278                         struct sockaddr_in sin;
3279                         struct ifaddr *oifa = NULL;
3280
3281                         bzero(&sin, sizeof (sin));
3282                         sin.sin_family = AF_INET;
3283                         sin.sin_len = sizeof (sin);
3284                         sin.sin_addr = dst;
3285
3286                         lck_mtx_lock(rnh_lock);
3287                         if ((rt = rt_lookup(TRUE, (struct sockaddr *)&sin, NULL,
3288                             rt_tables[AF_INET], IFSCOPE_NONE)) != NULL) {
3289                                 RT_LOCK(rt);
3290                                 /*
3291                                  * If the route uses a different interface,
3292                                  * use that one instead.  The IP address of
3293                                  * the ifaddr that we pick up here is not
3294                                  * relevant.
3295                                  */
3296                                 if (ifa->ifa_ifp != rt->rt_ifp) {
3297                                         oifa = ifa;
3298                                         ifa = rt->rt_ifa;
3299                                         ifaref(ifa);
3300                                         RT_UNLOCK(rt);
3301                                 } else {
3302                                         RT_UNLOCK(rt);
3303                                 }
3304                                 rtfree_locked(rt);
3305                         }
3306                         lck_mtx_unlock(rnh_lock);
3307
3308                         if (oifa != NULL) {
3309                                 struct ifaddr *iifa;
3310
3311                                 /*
3312                                  * See if the interface pointed to by the
3313                                  * route is configured with the source IP
3314                                  * address of the packet.
3315                                  */
3316                                 iifa = (struct ifaddr *)ifa_foraddr_scoped(
3317                                     src.s_addr, ifa->ifa_ifp->if_index);
3318
3319                                 if (iifa != NULL) {
3320                                         /*
3321                                          * Found it; drop the original one
3322                                          * as well as the route interface
3323                                          * address, and use this instead.
3324                                          */
3325                                         ifafree(oifa);
3326                                         ifafree(ifa);
3327                                         ifa = iifa;
3328                                 } else if (!ipforwarding ||
3329                                     (rt->rt_flags & RTF_GATEWAY)) {
3330                                         /*
3331                                          * This interface doesn't have that
3332                                          * source IP address; drop the route
3333                                          * interface address and just use the
3334                                          * original one, and let the caller
3335                                          * do a scoped route lookup.
3336                                          */
3337                                         ifafree(ifa);
3338                                         ifa = oifa;
3339                                 } else {
3340                                         /*
3341                                          * Forwarding is enabled and the source
3342                                          * address belongs to one of our own
3343                                          * interfaces which isn't the outgoing
3344                                          * interface, and we have a route, and
3345                                          * the destination is on a network that
3346                                          * is directly attached (onlink); drop
3347                                          * the original one and use the route
3348                                          * interface address instead.
3349                                          */
3350                                         ifafree(oifa);
3351                                 }
3352                         }
3353                 } else if (ifa != NULL && ro->ro_rt != NULL &&
3354                     !(ro->ro_rt->rt_flags & RTF_GATEWAY) &&
3355                     ifa->ifa_ifp != ro->ro_rt->rt_ifp && ipforwarding) {
3356                         /*
3357                          * Forwarding is enabled and the source address belongs
3358                          * to one of our own interfaces which isn't the same
3359                          * as the interface used by the known route; drop the
3360                          * original one and use the route interface address.
3361                          */
3362                         ifafree(ifa);
3363                         ifa = ro->ro_rt->rt_ifa;
3364                         ifaref(ifa);
3365                 }
3366
3367                 if (ip_select_srcif_debug && ifa != NULL) {
3368                         printf("%s->%s ifscope %d ifa_if %s%d\n",
3369                             s_src, s_dst, ifscope, ifa->ifa_ifp->if_name,
3370                             ifa->ifa_ifp->if_unit);
3371                 }
3372         }
3373
3374         if (ro->ro_rt != NULL)
3375                 RT_LOCK_ASSERT_HELD(ro->ro_rt);
3376         /*
3377          * If there is a non-loopback route with the wrong interface, or if
3378          * there is no interface configured with such an address, blow it
3379          * away.  Except for local/loopback, we look for one with a matching
3380          * interface scope/index.
3381          */
3382         if (ro->ro_rt != NULL &&
3383             (ifa == NULL || (ifa->ifa_ifp != rt_ifp && rt_ifp != lo_ifp) ||
3384             !(ro->ro_rt->rt_flags & RTF_UP))) {
3385                 if (ip_select_srcif_debug) {
3386                         if (ifa != NULL) {
3387                                 printf("%s->%s ifscope %d ro_if %s%d != "
3388                                     "ifa_if %s%d (cached route cleared)\n",
3389                                     s_src, s_dst, ifscope, rt_ifp->if_name,
3390                                     rt_ifp->if_unit, ifa->ifa_ifp->if_name,
3391                                     ifa->ifa_ifp->if_unit);
3392                         } else {
3393                                 printf("%s->%s ifscope %d ro_if %s%d "
3394                                     "(no ifa_if found)\n",
3395                                     s_src, s_dst, ifscope, rt_ifp->if_name,
3396                                     rt_ifp->if_unit);
3397                         }
3398                 }
3399
3400                 RT_UNLOCK(ro->ro_rt);
3401                 rtfree(ro->ro_rt);
3402                 ro->ro_rt = NULL;
3403                 ro->ro_flags &= ~ROF_SRCIF_SELECTED;
3404
3405                 /*
3406                  * If the destination is IPv4 LLA and the route's interface
3407                  * doesn't match the source interface, then the source IP
3408                  * address is wrong; it most likely belongs to the primary
3409                  * interface associated with the IPv4 LL subnet.  Drop the
3410                  * packet rather than letting it go out and return an error
3411                  * to the ULP.  This actually applies not only to IPv4 LL
3412                  * but other shared subnets; for now we explicitly test only
3413                  * for the former case and save the latter for future.
3414                  */
3415                 if (IN_LINKLOCAL(ntohl(dst.s_addr)) &&
3416                     !IN_LINKLOCAL(ntohl(src.s_addr)) && ifa != NULL) {
3417                         ifafree(ifa);
3418                         ifa = NULL;
3419                 }
3420         }
3421
3422         if (ip_select_srcif_debug && ifa == NULL) {
3423                 printf("%s->%s ifscope %d (neither ro_if/ifa_if found)\n",
3424                     s_src, s_dst, ifscope);
3425         }
3426
3427         /*
3428          * If there is a route, mark it accordingly.  If there isn't one,
3429          * we'll get here again during the next transmit (possibly with a
3430          * route) and the flag will get set at that point.  For IPv4 LLA
3431          * destination, mark it only if the route has been fully resolved;
3432          * otherwise we want to come back here again when the route points
3433          * to the interface over which the ARP reply arrives on.
3434          */
3435         if (ro->ro_rt != NULL && (!IN_LINKLOCAL(ntohl(dst.s_addr)) ||
3436             (ro->ro_rt->rt_gateway->sa_family == AF_LINK &&
3437             SDL(ro->ro_rt->rt_gateway)->sdl_alen != 0))) {
3438                 ro->ro_flags |= ROF_SRCIF_SELECTED;
3439                 ro->ro_rt->generation_id = route_generation;
3440         }
3441
3442         if (ro->ro_rt != NULL)
3443                 RT_UNLOCK(ro->ro_rt);
3444
3445         return (ifa);
3446 }
3447
3448 /*
3449  * Handler for setting IP_FORCE_OUT_IFP or IP_BOUND_IF socket option.
3450  */
3451 static void
3452 ip_bindif(struct inpcb *inp, unsigned int ifscope)
3453 {
3454         /*
3455          * A zero interface scope value indicates an "unbind".
3456          * Otherwise, take in whatever value the app desires;
3457          * the app may already know the scope (or force itself
3458          * to such a scope) ahead of time before the interface
3459          * gets attached.  It doesn't matter either way; any
3460          * route lookup from this point on will require an
3461          * exact match for the embedded interface scope.
3462          */
3463         inp->inp_boundif = ifscope;
3464         if (inp->inp_boundif == IFSCOPE_NONE)
3465                 inp->inp_flags &= ~INP_BOUND_IF;
3466         else
3467                 inp->inp_flags |= INP_BOUND_IF;
3468
3469         /* Blow away any cached route in the PCB */
3470         if (inp->inp_route.ro_rt != NULL) {
3471                 rtfree(inp->inp_route.ro_rt);
3472                 inp->inp_route.ro_rt = NULL;
3473         }
3474 }