bsd/netinet/ip_input.c

   1 /*
   2  * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * Copyright (c) 1982, 1986, 1988, 1993
  30  *      The Regents of the University of California.  All rights reserved.
  31  *
  32  * Redistribution and use in source and binary forms, with or without
  33  * modification, are permitted provided that the following conditions
  34  * are met:
  35  * 1. Redistributions of source code must retain the above copyright
  36  *    notice, this list of conditions and the following disclaimer.
  37  * 2. Redistributions in binary form must reproduce the above copyright
  38  *    notice, this list of conditions and the following disclaimer in the
  39  *    documentation and/or other materials provided with the distribution.
  40  * 3. All advertising materials mentioning features or use of this software
  41  *    must display the following acknowledgement:
  42  *      This product includes software developed by the University of
  43  *      California, Berkeley and its contributors.
  44  * 4. Neither the name of the University nor the names of its contributors
  45  *    may be used to endorse or promote products derived from this software
  46  *    without specific prior written permission.
  47  *
  48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  58  * SUCH DAMAGE.
  59  *
  60  *      @(#)ip_input.c  8.2 (Berkeley) 1/4/94
  61  */
  62 /*
  63  * NOTICE: This file was modified by SPARTA, Inc. in 2007 to introduce
  64  * support for mandatory and extensible security protections.  This notice
  65  * is included in support of clause 2.2 (b) of the Apple Public License,
  66  * Version 2.0.
  67  */
  68
  69 #define _IP_VHL
  70
  71 #include <sys/param.h>
  72 #include <sys/systm.h>
  73 #include <sys/mbuf.h>
  74 #include <sys/malloc.h>
  75 #include <sys/domain.h>
  76 #include <sys/protosw.h>
  77 #include <sys/socket.h>
  78 #include <sys/time.h>
  79 #include <sys/kernel.h>
  80 #include <sys/syslog.h>
  81 #include <sys/sysctl.h>
  82 #include <sys/mcache.h>
  83 #include <sys/socketvar.h>
  84 #include <sys/kdebug.h>
  85 #include <mach/mach_time.h>
  86 #include <mach/sdt.h>
  87
  88 #include <machine/endian.h>
  89 #include <dev/random/randomdev.h>
  90
  91 #include <kern/queue.h>
  92 #include <kern/locks.h>
  93 #include <libkern/OSAtomic.h>
  94
  95 #include <pexpert/pexpert.h>
  96
  97 #include <net/if.h>
  98 #include <net/if_var.h>
  99 #include <net/if_dl.h>
 100 #include <net/route.h>
 101 #include <net/kpi_protocol.h>
 102 #include <net/ntstat.h>
 103 #include <net/dlil.h>
 104 #include <net/classq/classq.h>
 105 #include <net/net_perf.h>
 106 #include <net/init.h>
 107 #if PF
 108 #include <net/pfvar.h>
 109 #endif /* PF */
 110
 111 #include <netinet/in.h>
 112 #include <netinet/in_systm.h>
 113 #include <netinet/in_var.h>
 114 #include <netinet/in_arp.h>
 115 #include <netinet/ip.h>
 116 #include <netinet/in_pcb.h>
 117 #include <netinet/ip_var.h>
 118 #include <netinet/ip_icmp.h>
 119 #include <netinet/ip_fw.h>
 120 #include <netinet/ip_divert.h>
 121 #include <netinet/kpi_ipfilter_var.h>
 122 #include <netinet/udp.h>
 123 #include <netinet/udp_var.h>
 124 #include <netinet/bootp.h>
 125 #include <netinet/lro_ext.h>
 126
 127 #if DUMMYNET
 128 #include <netinet/ip_dummynet.h>
 129 #endif /* DUMMYNET */
 130
 131 #if CONFIG_MACF_NET
 132 #include <security/mac_framework.h>
 133 #endif /* CONFIG_MACF_NET */
 134
 135 #if IPSEC
 136 #include <netinet6/ipsec.h>
 137 #include <netkey/key.h>
 138 #endif /* IPSEC */
 139
 140 #define DBG_LAYER_BEG           NETDBG_CODE(DBG_NETIP, 0)
 141 #define DBG_LAYER_END           NETDBG_CODE(DBG_NETIP, 2)
 142 #define DBG_FNC_IP_INPUT        NETDBG_CODE(DBG_NETIP, (2 << 8))
 143
 144 #if IPSEC
 145 extern int ipsec_bypass;
 146 extern lck_mtx_t *sadb_mutex;
 147
 148 lck_grp_t       *sadb_stat_mutex_grp;
 149 lck_grp_attr_t  *sadb_stat_mutex_grp_attr;
 150 lck_attr_t      *sadb_stat_mutex_attr;
 151 decl_lck_mtx_data(, sadb_stat_mutex_data);
 152 lck_mtx_t       *sadb_stat_mutex = &sadb_stat_mutex_data;
 153 #endif /* IPSEC */
 154
 155 MBUFQ_HEAD(fq_head);
 156
 157 static int frag_timeout_run;            /* frag timer is scheduled to run */
 158 static void frag_timeout(void *);
 159 static void frag_sched_timeout(void);
 160
 161 static struct ipq *ipq_alloc(int);
 162 static void ipq_free(struct ipq *);
 163 static void ipq_updateparams(void);
 164 static void ip_input_second_pass(struct mbuf *, struct ifnet *,
 165     u_int32_t, int, int, struct ip_fw_in_args *, int);
 166
 167 decl_lck_mtx_data(static, ipqlock);
 168 static lck_attr_t       *ipqlock_attr;
 169 static lck_grp_t        *ipqlock_grp;
 170 static lck_grp_attr_t   *ipqlock_grp_attr;
 171
 172 /* Packet reassembly stuff */
 173 #define IPREASS_NHASH_LOG2      6
 174 #define IPREASS_NHASH           (1 << IPREASS_NHASH_LOG2)
 175 #define IPREASS_HMASK           (IPREASS_NHASH - 1)
 176 #define IPREASS_HASH(x, y) \
 177         (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK)
 178
 179 /* IP fragment reassembly queues (protected by ipqlock) */
 180 static TAILQ_HEAD(ipqhead, ipq) ipq[IPREASS_NHASH]; /* ip reassembly queues */
 181 static int maxnipq;                     /* max packets in reass queues */
 182 static u_int32_t maxfragsperpacket;     /* max frags/packet in reass queues */
 183 static u_int32_t nipq;                  /* # of packets in reass queues */
 184 static u_int32_t ipq_limit;             /* ipq allocation limit */
 185 static u_int32_t ipq_count;             /* current # of allocated ipq's */
 186
 187 static int sysctl_ipforwarding SYSCTL_HANDLER_ARGS;
 188 static int sysctl_maxnipq SYSCTL_HANDLER_ARGS;
 189 static int sysctl_maxfragsperpacket SYSCTL_HANDLER_ARGS;
 190
 191 #if (DEBUG || DEVELOPMENT)
 192 static int sysctl_reset_ip_input_stats SYSCTL_HANDLER_ARGS;
 193 static int sysctl_ip_input_measure_bins SYSCTL_HANDLER_ARGS;
 194 static int sysctl_ip_input_getperf SYSCTL_HANDLER_ARGS;
 195 #endif /* (DEBUG || DEVELOPMENT) */
 196
 197 int ipforwarding = 0;
 198 SYSCTL_PROC(_net_inet_ip, IPCTL_FORWARDING, forwarding,
 199         CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &ipforwarding, 0,
 200         sysctl_ipforwarding, "I", "Enable IP forwarding between interfaces");
 201
 202 static int ipsendredirects = 1; /* XXX */
 203 SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect,
 204         CTLFLAG_RW | CTLFLAG_LOCKED, &ipsendredirects, 0,
 205         "Enable sending IP redirects");
 206
 207 int ip_defttl = IPDEFTTL;
 208 SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW | CTLFLAG_LOCKED,
 209         &ip_defttl, 0, "Maximum TTL on IP packets");
 210
 211 static int ip_dosourceroute = 0;
 212 SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute,
 213         CTLFLAG_RW | CTLFLAG_LOCKED, &ip_dosourceroute, 0,
 214         "Enable forwarding source routed IP packets");
 215
 216 static int ip_acceptsourceroute = 0;
 217 SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute,
 218         CTLFLAG_RW | CTLFLAG_LOCKED, &ip_acceptsourceroute, 0,
 219         "Enable accepting source routed IP packets");
 220
 221 static int ip_sendsourcequench = 0;
 222 SYSCTL_INT(_net_inet_ip, OID_AUTO, sendsourcequench,
 223         CTLFLAG_RW | CTLFLAG_LOCKED, &ip_sendsourcequench, 0,
 224         "Enable the transmission of source quench packets");
 225
 226 SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets,
 227         CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &maxnipq, 0, sysctl_maxnipq,
 228         "I", "Maximum number of IPv4 fragment reassembly queue entries");
 229
 230 SYSCTL_UINT(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_RD | CTLFLAG_LOCKED,
 231         &nipq, 0, "Current number of IPv4 fragment reassembly queue entries");
 232
 233 SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragsperpacket,
 234         CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &maxfragsperpacket, 0,
 235         sysctl_maxfragsperpacket, "I",
 236         "Maximum number of IPv4 fragments allowed per packet");
 237
 238 static uint32_t ip_adj_clear_hwcksum = 0;
 239 SYSCTL_UINT(_net_inet_ip, OID_AUTO, adj_clear_hwcksum,
 240         CTLFLAG_RW | CTLFLAG_LOCKED, &ip_adj_clear_hwcksum, 0,
 241         "Invalidate hwcksum info when adjusting length");
 242
 243 static uint32_t ip_adj_partial_sum = 1;
 244 SYSCTL_UINT(_net_inet_ip, OID_AUTO, adj_partial_sum,
 245         CTLFLAG_RW | CTLFLAG_LOCKED, &ip_adj_partial_sum, 0,
 246         "Perform partial sum adjustment of trailing bytes at IP layer");
 247
 248 /*
 249  * XXX - Setting ip_checkinterface mostly implements the receive side of
 250  * the Strong ES model described in RFC 1122, but since the routing table
 251  * and transmit implementation do not implement the Strong ES model,
 252  * setting this to 1 results in an odd hybrid.
 253  *
 254  * XXX - ip_checkinterface currently must be disabled if you use ipnat
 255  * to translate the destination address to another local interface.
 256  *
 257  * XXX - ip_checkinterface must be disabled if you add IP aliases
 258  * to the loopback interface instead of the interface where the
 259  * packets for those addresses are received.
 260  */
 261 static int ip_checkinterface = 0;
 262 SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW | CTLFLAG_LOCKED,
 263         &ip_checkinterface, 0, "Verify packet arrives on correct interface");
 264
 265 static int ip_chaining = 1;
 266 SYSCTL_INT(_net_inet_ip, OID_AUTO, rx_chaining, CTLFLAG_RW | CTLFLAG_LOCKED,
 267         &ip_chaining, 1, "Do receive side ip address based chaining");
 268
 269 static int ip_chainsz = 6;
 270 SYSCTL_INT(_net_inet_ip, OID_AUTO, rx_chainsz, CTLFLAG_RW | CTLFLAG_LOCKED,
 271         &ip_chainsz, 1, "IP receive side max chaining");
 272
 273 #if (DEBUG || DEVELOPMENT)
 274 static int ip_input_measure = 0;
 275 SYSCTL_PROC(_net_inet_ip, OID_AUTO, input_perf,
 276         CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
 277         &ip_input_measure, 0, sysctl_reset_ip_input_stats, "I", "Do time measurement");
 278
 279 static uint64_t ip_input_measure_bins = 0;
 280 SYSCTL_PROC(_net_inet_ip, OID_AUTO, input_perf_bins,
 281         CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &ip_input_measure_bins, 0,
 282         sysctl_ip_input_measure_bins, "I",
 283         "bins for chaining performance data histogram");
 284
 285 static net_perf_t net_perf;
 286 SYSCTL_PROC(_net_inet_ip, OID_AUTO, input_perf_data,
 287         CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
 288         0, 0, sysctl_ip_input_getperf, "S,net_perf",
 289         "IP input performance data (struct net_perf, net/net_perf.h)");
 290 #endif /* (DEBUG || DEVELOPMENT) */
 291
 292 #if DIAGNOSTIC
 293 static int ipprintfs = 0;
 294 #endif
 295
 296 struct protosw *ip_protox[IPPROTO_MAX];
 297
 298 static lck_grp_attr_t   *in_ifaddr_rwlock_grp_attr;
 299 static lck_grp_t        *in_ifaddr_rwlock_grp;
 300 static lck_attr_t       *in_ifaddr_rwlock_attr;
 301 decl_lck_rw_data(, in_ifaddr_rwlock_data);
 302 lck_rw_t                *in_ifaddr_rwlock = &in_ifaddr_rwlock_data;
 303
 304 /* Protected by in_ifaddr_rwlock */
 305 struct in_ifaddrhead in_ifaddrhead;             /* first inet address */
 306 struct in_ifaddrhashhead *in_ifaddrhashtbl;     /* inet addr hash table  */
 307
 308 #define INADDR_NHASH    61
 309 static u_int32_t inaddr_nhash;                  /* hash table size */
 310 static u_int32_t inaddr_hashp;                  /* next largest prime */
 311
 312 static int ip_getstat SYSCTL_HANDLER_ARGS;
 313 struct ipstat ipstat;
 314 SYSCTL_PROC(_net_inet_ip, IPCTL_STATS, stats,
 315         CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
 316         0, 0, ip_getstat, "S,ipstat",
 317         "IP statistics (struct ipstat, netinet/ip_var.h)");
 318
 319 #if IPCTL_DEFMTU
 320 SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW | CTLFLAG_LOCKED,
 321         &ip_mtu, 0, "Default MTU");
 322 #endif /* IPCTL_DEFMTU */
 323
 324 #if IPSTEALTH
 325 static int      ipstealth = 0;
 326 SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW | CTLFLAG_LOCKED,
 327         &ipstealth, 0, "");
 328 #endif /* IPSTEALTH */
 329
 330 /* Firewall hooks */
 331 #if IPFIREWALL
 332 ip_fw_chk_t *ip_fw_chk_ptr;
 333 int fw_enable = 1;
 334 int fw_bypass = 1;
 335 int fw_one_pass = 0;
 336 #endif /* IPFIREWALL */
 337
 338 #if DUMMYNET
 339 ip_dn_io_t *ip_dn_io_ptr;
 340 #endif /* DUMMYNET */
 341
 342 SYSCTL_NODE(_net_inet_ip, OID_AUTO, linklocal,
 343         CTLFLAG_RW | CTLFLAG_LOCKED, 0, "link local");
 344
 345 struct ip_linklocal_stat ip_linklocal_stat;
 346 SYSCTL_STRUCT(_net_inet_ip_linklocal, OID_AUTO, stat,
 347         CTLFLAG_RD | CTLFLAG_LOCKED, &ip_linklocal_stat, ip_linklocal_stat,
 348         "Number of link local packets with TTL less than 255");
 349
 350 SYSCTL_NODE(_net_inet_ip_linklocal, OID_AUTO, in,
 351         CTLFLAG_RW | CTLFLAG_LOCKED, 0, "link local input");
 352
 353 int ip_linklocal_in_allowbadttl = 1;
 354 SYSCTL_INT(_net_inet_ip_linklocal_in, OID_AUTO, allowbadttl,
 355         CTLFLAG_RW | CTLFLAG_LOCKED, &ip_linklocal_in_allowbadttl, 0,
 356         "Allow incoming link local packets with TTL less than 255");
 357
 358
 359 /*
 360  * We need to save the IP options in case a protocol wants to respond
 361  * to an incoming packet over the same route if the packet got here
 362  * using IP source routing.  This allows connection establishment and
 363  * maintenance when the remote end is on a network that is not known
 364  * to us.
 365  */
 366 static int      ip_nhops = 0;
 367 static  struct ip_srcrt {
 368         struct  in_addr dst;                    /* final destination */
 369         char    nop;                            /* one NOP to align */
 370         char    srcopt[IPOPT_OFFSET + 1];       /* OPTVAL, OLEN and OFFSET */
 371         struct  in_addr route[MAX_IPOPTLEN / sizeof (struct in_addr)];
 372 } ip_srcrt;
 373
 374 static void in_ifaddrhashtbl_init(void);
 375 static void save_rte(u_char *, struct in_addr);
 376 static int ip_dooptions(struct mbuf *, int, struct sockaddr_in *);
 377 static void ip_forward(struct mbuf *, int, struct sockaddr_in *);
 378 static void frag_freef(struct ipqhead *, struct ipq *);
 379 #if IPDIVERT
 380 #ifdef IPDIVERT_44
 381 static struct mbuf *ip_reass(struct mbuf *, u_int32_t *, u_int16_t *);
 382 #else /* !IPDIVERT_44 */
 383 static struct mbuf *ip_reass(struct mbuf *, u_int16_t *, u_int16_t *);
 384 #endif /* !IPDIVERT_44 */
 385 #else /* !IPDIVERT */
 386 static struct mbuf *ip_reass(struct mbuf *);
 387 #endif /* !IPDIVERT */
 388 static void ip_fwd_route_copyout(struct ifnet *, struct route *);
 389 static void ip_fwd_route_copyin(struct ifnet *, struct route *);
 390 static inline u_short ip_cksum(struct mbuf *, int);
 391
 392 int ip_use_randomid = 1;
 393 SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW | CTLFLAG_LOCKED,
 394         &ip_use_randomid, 0, "Randomize IP packets IDs");
 395
 396 /*
 397  * On platforms which require strict alignment (currently for anything but
 398  * i386 or x86_64), check if the IP header pointer is 32-bit aligned; if not,
 399  * copy the contents of the mbuf chain into a new chain, and free the original
 400  * one.  Create some head room in the first mbuf of the new chain, in case
 401  * it's needed later on.
 402  */
 403 #if defined(__i386__) || defined(__x86_64__)
 404 #define IP_HDR_ALIGNMENT_FIXUP(_m, _ifp, _action) do { } while (0)
 405 #else /* !__i386__ && !__x86_64__ */
 406 #define IP_HDR_ALIGNMENT_FIXUP(_m, _ifp, _action) do {                  \
 407         if (!IP_HDR_ALIGNED_P(mtod(_m, caddr_t))) {                     \
 408                 struct mbuf *_n;                                        \
 409                 struct ifnet *__ifp = (_ifp);                           \
 410                 atomic_add_64(&(__ifp)->if_alignerrs, 1);               \
 411                 if (((_m)->m_flags & M_PKTHDR) &&                       \
 412                     (_m)->m_pkthdr.pkt_hdr != NULL)                     \
 413                         (_m)->m_pkthdr.pkt_hdr = NULL;                  \
 414                 _n = m_defrag_offset(_m, max_linkhdr, M_NOWAIT);        \
 415                 if (_n == NULL) {                                       \
 416                         atomic_add_32(&ipstat.ips_toosmall, 1);         \
 417                         m_freem(_m);                                    \
 418                         (_m) = NULL;                                    \
 419                         _action;                                        \
 420                 } else {                                                \
 421                         VERIFY(_n != (_m));                             \
 422                         (_m) = _n;                                      \
 423                 }                                                       \
 424         }                                                               \
 425 } while (0)
 426 #endif /* !__i386__ && !__x86_64__ */
 427
 428 /*
 429  * GRE input handler function, settable via ip_gre_register_input() for PPTP.
 430  */
 431 static gre_input_func_t gre_input_func;
 432
 433 static void
 434 ip_init_delayed(void)
 435 {
 436         struct ifreq ifr;
 437         int error;
 438         struct sockaddr_in *sin;
 439
 440         bzero(&ifr, sizeof(ifr));
 441         strlcpy(ifr.ifr_name, "lo0", sizeof(ifr.ifr_name));
 442         sin = (struct sockaddr_in *)(void *)&ifr.ifr_addr;
 443         sin->sin_len = sizeof(struct sockaddr_in);
 444         sin->sin_family = AF_INET;
 445         sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
 446         error = in_control(NULL, SIOCSIFADDR, (caddr_t)&ifr, lo_ifp, kernproc);
 447         if (error)
 448                 printf("%s: failed to initialise lo0's address, error=%d\n",
 449                     __func__, error);
 450 }
 451
 452 /*
 453  * IP initialization: fill in IP protocol switch table.
 454  * All protocols not implemented in kernel go to raw IP protocol handler.
 455  */
 456 void
 457 ip_init(struct protosw *pp, struct domain *dp)
 458 {
 459         static int ip_initialized = 0;
 460         struct protosw *pr;
 461         struct timeval tv;
 462         int i;
 463
 464         domain_proto_mtx_lock_assert_held();
 465         VERIFY((pp->pr_flags & (PR_INITIALIZED|PR_ATTACHED)) == PR_ATTACHED);
 466
 467         /* ipq_alloc() uses mbufs for IP fragment queue structures */
 468         _CASSERT(sizeof (struct ipq) <= _MLEN);
 469
 470         /*
 471          * Some ioctls (e.g. SIOCAIFADDR) use ifaliasreq struct, which is
 472          * interchangeable with in_aliasreq; they must have the same size.
 473          */
 474         _CASSERT(sizeof (struct ifaliasreq) == sizeof (struct in_aliasreq));
 475
 476         if (ip_initialized)
 477                 return;
 478         ip_initialized = 1;
 479
 480         in_ifaddr_init();
 481
 482         in_ifaddr_rwlock_grp_attr = lck_grp_attr_alloc_init();
 483         in_ifaddr_rwlock_grp = lck_grp_alloc_init("in_ifaddr_rwlock",
 484             in_ifaddr_rwlock_grp_attr);
 485         in_ifaddr_rwlock_attr = lck_attr_alloc_init();
 486         lck_rw_init(in_ifaddr_rwlock, in_ifaddr_rwlock_grp,
 487             in_ifaddr_rwlock_attr);
 488
 489         TAILQ_INIT(&in_ifaddrhead);
 490         in_ifaddrhashtbl_init();
 491
 492         ip_moptions_init();
 493
 494         pr = pffindproto_locked(PF_INET, IPPROTO_RAW, SOCK_RAW);
 495         if (pr == NULL) {
 496                 panic("%s: Unable to find [PF_INET,IPPROTO_RAW,SOCK_RAW]\n",
 497                     __func__);
 498                 /* NOTREACHED */
 499         }
 500
 501         /* Initialize the entire ip_protox[] array to IPPROTO_RAW. */
 502         for (i = 0; i < IPPROTO_MAX; i++)
 503                 ip_protox[i] = pr;
 504         /*
 505          * Cycle through IP protocols and put them into the appropriate place
 506          * in ip_protox[], skipping protocols IPPROTO_{IP,RAW}.
 507          */
 508         VERIFY(dp == inetdomain && dp->dom_family == PF_INET);
 509         TAILQ_FOREACH(pr, &dp->dom_protosw, pr_entry) {
 510                 VERIFY(pr->pr_domain == dp);
 511                 if (pr->pr_protocol != 0 && pr->pr_protocol != IPPROTO_RAW) {
 512                         /* Be careful to only index valid IP protocols. */
 513                         if (pr->pr_protocol < IPPROTO_MAX)
 514                                 ip_protox[pr->pr_protocol] = pr;
 515                 }
 516         }
 517
 518         /* IP fragment reassembly queue lock */
 519         ipqlock_grp_attr  = lck_grp_attr_alloc_init();
 520         ipqlock_grp = lck_grp_alloc_init("ipqlock", ipqlock_grp_attr);
 521         ipqlock_attr = lck_attr_alloc_init();
 522         lck_mtx_init(&ipqlock, ipqlock_grp, ipqlock_attr);
 523
 524         lck_mtx_lock(&ipqlock);
 525         /* Initialize IP reassembly queue. */
 526         for (i = 0; i < IPREASS_NHASH; i++)
 527                 TAILQ_INIT(&ipq[i]);
 528
 529         maxnipq = nmbclusters / 32;
 530         maxfragsperpacket = 128; /* enough for 64k in 512 byte fragments */
 531         ipq_updateparams();
 532         lck_mtx_unlock(&ipqlock);
 533
 534         getmicrotime(&tv);
 535         ip_id = RandomULong() ^ tv.tv_usec;
 536         ip_initid();
 537
 538         ipf_init();
 539
 540 #if IPSEC
 541         sadb_stat_mutex_grp_attr = lck_grp_attr_alloc_init();
 542         sadb_stat_mutex_grp = lck_grp_alloc_init("sadb_stat",
 543             sadb_stat_mutex_grp_attr);
 544         sadb_stat_mutex_attr = lck_attr_alloc_init();
 545         lck_mtx_init(sadb_stat_mutex, sadb_stat_mutex_grp,
 546             sadb_stat_mutex_attr);
 547
 548 #endif
 549         arp_init();
 550         net_init_add(ip_init_delayed);
 551 }
 552
 553 /*
 554  * Initialize IPv4 source address hash table.
 555  */
 556 static void
 557 in_ifaddrhashtbl_init(void)
 558 {
 559         int i, k, p;
 560
 561         if (in_ifaddrhashtbl != NULL)
 562                 return;
 563
 564         PE_parse_boot_argn("inaddr_nhash", &inaddr_nhash,
 565             sizeof (inaddr_nhash));
 566         if (inaddr_nhash == 0)
 567                 inaddr_nhash = INADDR_NHASH;
 568
 569         MALLOC(in_ifaddrhashtbl, struct in_ifaddrhashhead *,
 570             inaddr_nhash * sizeof (*in_ifaddrhashtbl),
 571             M_IFADDR, M_WAITOK | M_ZERO);
 572         if (in_ifaddrhashtbl == NULL)
 573                 panic("in_ifaddrhashtbl_init allocation failed");
 574
 575         /*
 576          * Generate the next largest prime greater than inaddr_nhash.
 577          */
 578         k = (inaddr_nhash % 2 == 0) ? inaddr_nhash + 1 : inaddr_nhash + 2;
 579         for (;;) {
 580                 p = 1;
 581                 for (i = 3; i * i <= k; i += 2) {
 582                         if (k % i == 0)
 583                                 p = 0;
 584                 }
 585                 if (p == 1)
 586                         break;
 587                 k += 2;
 588         }
 589         inaddr_hashp = k;
 590 }
 591
 592 u_int32_t
 593 inaddr_hashval(u_int32_t key)
 594 {
 595         /*
 596          * The hash index is the computed prime times the key modulo
 597          * the hash size, as documented in "Introduction to Algorithms"
 598          * (Cormen, Leiserson, Rivest).
 599          */
 600         if (inaddr_nhash > 1)
 601                 return ((key * inaddr_hashp) % inaddr_nhash);
 602         else
 603                 return (0);
 604 }
 605
 606 void
 607 ip_proto_dispatch_in_wrapper(struct mbuf *m, int hlen, u_int8_t proto)
 608 {
 609         ip_proto_dispatch_in(m, hlen, proto, 0);
 610 }
 611
 612 __private_extern__ void
 613 ip_proto_dispatch_in(struct mbuf *m, int hlen, u_int8_t proto,
 614     ipfilter_t inject_ipfref)
 615 {
 616         struct ipfilter *filter;
 617         int seen = (inject_ipfref == NULL);
 618         int     changed_header = 0;
 619         struct ip *ip;
 620         void (*pr_input)(struct mbuf *, int len);
 621
 622         if (!TAILQ_EMPTY(&ipv4_filters)) {
 623                 ipf_ref();
 624                 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
 625                         if (seen == 0) {
 626                                 if ((struct ipfilter *)inject_ipfref == filter)
 627                                         seen = 1;
 628                         } else if (filter->ipf_filter.ipf_input) {
 629                                 errno_t result;
 630
 631                                 if (changed_header == 0) {
 632                                         /*
 633                                          * Perform IP header alignment fixup,
 634                                          * if needed, before passing packet
 635                                          * into filter(s).
 636                                          */
 637                                         IP_HDR_ALIGNMENT_FIXUP(m,
 638                                             m->m_pkthdr.rcvif, ipf_unref());
 639
 640                                         /* ipf_unref() already called */
 641                                         if (m == NULL)
 642                                                 return;
 643
 644                                         changed_header = 1;
 645                                         ip = mtod(m, struct ip *);
 646                                         ip->ip_len = htons(ip->ip_len + hlen);
 647                                         ip->ip_off = htons(ip->ip_off);
 648                                         ip->ip_sum = 0;
 649                                         ip->ip_sum = ip_cksum_hdr_in(m, hlen);
 650                                 }
 651                                 result = filter->ipf_filter.ipf_input(
 652                                     filter->ipf_filter.cookie, (mbuf_t *)&m,
 653                                     hlen, proto);
 654                                 if (result == EJUSTRETURN) {
 655                                         ipf_unref();
 656                                         return;
 657                                 }
 658                                 if (result != 0) {
 659                                         ipf_unref();
 660                                         m_freem(m);
 661                                         return;
 662                                 }
 663                         }
 664                 }
 665                 ipf_unref();
 666         }
 667
 668         /* Perform IP header alignment fixup (post-filters), if needed */
 669         IP_HDR_ALIGNMENT_FIXUP(m, m->m_pkthdr.rcvif, return);
 670
 671         /*
 672          * If there isn't a specific lock for the protocol
 673          * we're about to call, use the generic lock for AF_INET.
 674          * otherwise let the protocol deal with its own locking
 675          */
 676         ip = mtod(m, struct ip *);
 677
 678         if (changed_header) {
 679                 ip->ip_len = ntohs(ip->ip_len) - hlen;
 680                 ip->ip_off = ntohs(ip->ip_off);
 681         }
 682
 683         if ((pr_input = ip_protox[ip->ip_p]->pr_input) == NULL) {
 684                 m_freem(m);
 685         } else if (!(ip_protox[ip->ip_p]->pr_flags & PR_PROTOLOCK)) {
 686                 lck_mtx_lock(inet_domain_mutex);
 687                 pr_input(m, hlen);
 688                 lck_mtx_unlock(inet_domain_mutex);
 689         } else {
 690                 pr_input(m, hlen);
 691         }
 692 }
 693
 694 struct pktchain_elm {
 695         struct mbuf     *pkte_head;
 696         struct mbuf     *pkte_tail;
 697         struct in_addr  pkte_saddr;
 698         struct in_addr  pkte_daddr;
 699         uint16_t        pkte_npkts;
 700         uint16_t        pkte_proto;
 701         uint32_t        pkte_nbytes;
 702 };
 703
 704 typedef struct pktchain_elm pktchain_elm_t;
 705
 706 /* Store up to PKTTBL_SZ unique flows on the stack */
 707 #define PKTTBL_SZ       7
 708
 709 static struct mbuf *
 710 ip_chain_insert(struct mbuf *packet, pktchain_elm_t *tbl)
 711 {
 712         struct ip*      ip;
 713         int             pkttbl_idx = 0;
 714
 715         ip = mtod(packet, struct ip*);
 716
 717         /* reusing the hash function from inaddr_hashval */
 718         pkttbl_idx = inaddr_hashval(ntohs(ip->ip_src.s_addr)) % PKTTBL_SZ;
 719         if (tbl[pkttbl_idx].pkte_head == NULL) {
 720                 tbl[pkttbl_idx].pkte_head = packet;
 721                 tbl[pkttbl_idx].pkte_saddr.s_addr = ip->ip_src.s_addr;
 722                 tbl[pkttbl_idx].pkte_daddr.s_addr = ip->ip_dst.s_addr;
 723                 tbl[pkttbl_idx].pkte_proto = ip->ip_p;
 724         } else {
 725                 if ((ip->ip_dst.s_addr == tbl[pkttbl_idx].pkte_daddr.s_addr) &&
 726                     (ip->ip_src.s_addr == tbl[pkttbl_idx].pkte_saddr.s_addr) &&
 727                     (ip->ip_p == tbl[pkttbl_idx].pkte_proto)) {
 728                 } else {
 729                         return (packet);
 730                 }
 731         }
 732         if (tbl[pkttbl_idx].pkte_tail != NULL)
 733                 mbuf_setnextpkt(tbl[pkttbl_idx].pkte_tail, packet);
 734
 735         tbl[pkttbl_idx].pkte_tail = packet;
 736         tbl[pkttbl_idx].pkte_npkts += 1;
 737         tbl[pkttbl_idx].pkte_nbytes += packet->m_pkthdr.len;
 738         return (NULL);
 739 }
 740
 741 /* args is a dummy variable here for backward compatibility */
 742 static void
 743 ip_input_second_pass_loop_tbl(pktchain_elm_t *tbl, struct ip_fw_in_args *args)
 744 {
 745         int i = 0;
 746
 747         for (i = 0; i < PKTTBL_SZ; i++) {
 748                 if (tbl[i].pkte_head != NULL) {
 749                         struct mbuf *m = tbl[i].pkte_head;
 750                         ip_input_second_pass(m, m->m_pkthdr.rcvif, 0,
 751                             tbl[i].pkte_npkts, tbl[i].pkte_nbytes, args, 0);
 752
 753                         if (tbl[i].pkte_npkts > 2)
 754                                 ipstat.ips_rxc_chainsz_gt2++;
 755                         if (tbl[i].pkte_npkts > 4)
 756                                 ipstat.ips_rxc_chainsz_gt4++;
 757 #if (DEBUG || DEVELOPMENT)
 758                         if (ip_input_measure)
 759                                 net_perf_histogram(&net_perf, tbl[i].pkte_npkts);
 760 #endif /* (DEBUG || DEVELOPMENT) */
 761                         tbl[i].pkte_head = tbl[i].pkte_tail = NULL;
 762                         tbl[i].pkte_npkts = 0;
 763                         tbl[i].pkte_nbytes = 0;
 764                         /* no need to initialize address and protocol in tbl */
 765                 }
 766         }
 767 }
 768
 769 static void
 770 ip_input_cpout_args(struct ip_fw_in_args *args, struct ip_fw_args *args1,
 771     boolean_t *done_init)
 772 {
 773         if (*done_init == FALSE) {
 774                 bzero(args1, sizeof(struct ip_fw_args));
 775                 *done_init = TRUE;
 776         }
 777         args1->fwa_next_hop = args->fwai_next_hop;
 778         args1->fwa_ipfw_rule = args->fwai_ipfw_rule;
 779         args1->fwa_pf_rule = args->fwai_pf_rule;
 780         args1->fwa_divert_rule = args->fwai_divert_rule;
 781 }
 782
 783 static void
 784 ip_input_cpin_args(struct ip_fw_args *args1, struct ip_fw_in_args *args)
 785 {
 786         args->fwai_next_hop = args1->fwa_next_hop;
 787         args->fwai_ipfw_rule = args1->fwa_ipfw_rule;
 788         args->fwai_pf_rule = args1->fwa_pf_rule;
 789         args->fwai_divert_rule = args1->fwa_divert_rule;
 790 }
 791
 792 typedef enum {
 793         IPINPUT_DOCHAIN = 0,
 794         IPINPUT_DONTCHAIN,
 795         IPINPUT_FREED,
 796         IPINPUT_DONE
 797 } ipinput_chain_ret_t;
 798
 799 static void
 800 ip_input_update_nstat(struct ifnet *ifp, struct in_addr src_ip,
 801     u_int32_t packets, u_int32_t bytes)
 802 {
 803         if (nstat_collect) {
 804                 struct rtentry *rt = ifnet_cached_rtlookup_inet(ifp,
 805                     src_ip);
 806                 if (rt != NULL) {
 807                         nstat_route_rx(rt, packets, bytes, 0);
 808                         rtfree(rt);
 809                 }
 810         }
 811 }
 812
 813 static void
 814 ip_input_dispatch_chain(struct mbuf *m)
 815 {
 816         struct mbuf *tmp_mbuf = m;
 817         struct mbuf *nxt_mbuf = NULL;
 818         struct ip *ip = NULL;
 819         unsigned int hlen;
 820
 821         ip = mtod(tmp_mbuf, struct ip *);
 822         hlen = IP_VHL_HL(ip->ip_vhl) << 2;
 823         while(tmp_mbuf) {
 824                 nxt_mbuf = mbuf_nextpkt(tmp_mbuf);
 825                 mbuf_setnextpkt(tmp_mbuf, NULL);
 826
 827                 if ((sw_lro) && (ip->ip_p == IPPROTO_TCP))
 828                         tmp_mbuf = tcp_lro(tmp_mbuf, hlen);
 829                 if (tmp_mbuf)
 830                         ip_proto_dispatch_in(tmp_mbuf, hlen, ip->ip_p, 0);
 831                 tmp_mbuf = nxt_mbuf;
 832                 if (tmp_mbuf) {
 833                         ip = mtod(tmp_mbuf, struct ip *);
 834                         /* first mbuf of chain already has adjusted ip_len */
 835                         hlen = IP_VHL_HL(ip->ip_vhl) << 2;
 836                         ip->ip_len -= hlen;
 837                 }
 838         }
 839 }
 840
 841 static void
 842 ip_input_setdst_chain(struct mbuf *m, uint32_t ifindex, struct in_ifaddr *ia)
 843 {
 844         struct mbuf *tmp_mbuf = m;
 845
 846         while (tmp_mbuf) {
 847                 ip_setdstifaddr_info(tmp_mbuf, ifindex, ia);
 848                 tmp_mbuf = mbuf_nextpkt(tmp_mbuf);
 849         }
 850 }
 851
 852 static void
 853 ip_input_adjust(struct mbuf *m, struct ip *ip, struct ifnet *inifp)
 854 {
 855         boolean_t adjust = TRUE;
 856
 857         ASSERT(m_pktlen(m) > ip->ip_len);
 858
 859         /*
 860          * Invalidate hardware checksum info if ip_adj_clear_hwcksum
 861          * is set; useful to handle buggy drivers.  Note that this
 862          * should not be enabled by default, as we may get here due
 863          * to link-layer padding.
 864          */
 865         if (ip_adj_clear_hwcksum &&
 866             (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) &&
 867             !(inifp->if_flags & IFF_LOOPBACK) &&
 868             !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
 869                 m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
 870                 m->m_pkthdr.csum_data = 0;
 871                 ipstat.ips_adj_hwcsum_clr++;
 872         }
 873
 874         /*
 875          * If partial checksum information is available, subtract
 876          * out the partial sum of postpended extraneous bytes, and
 877          * update the checksum metadata accordingly.  By doing it
 878          * here, the upper layer transport only needs to adjust any
 879          * prepended extraneous bytes (else it will do both.)
 880          */
 881         if (ip_adj_partial_sum &&
 882             (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID|CSUM_PARTIAL)) ==
 883             (CSUM_DATA_VALID|CSUM_PARTIAL)) {
 884                 m->m_pkthdr.csum_rx_val = m_adj_sum16(m,
 885                     m->m_pkthdr.csum_rx_start, m->m_pkthdr.csum_rx_start,
 886                     (ip->ip_len - m->m_pkthdr.csum_rx_start),
 887                     m->m_pkthdr.csum_rx_val);
 888         } else if ((m->m_pkthdr.csum_flags &
 889             (CSUM_DATA_VALID|CSUM_PARTIAL)) ==
 890             (CSUM_DATA_VALID|CSUM_PARTIAL)) {
 891                 /*
 892                  * If packet has partial checksum info and we decided not
 893                  * to subtract the partial sum of postpended extraneous
 894                  * bytes here (not the default case), leave that work to
 895                  * be handled by the other layers.  For now, only TCP, UDP
 896                  * layers are capable of dealing with this.  For all other
 897                  * protocols (including fragments), trim and ditch the
 898                  * partial sum as those layers might not implement partial
 899                  * checksumming (or adjustment) at all.
 900                  */
 901                 if ((ip->ip_off & (IP_MF | IP_OFFMASK)) == 0 &&
 902                     (ip->ip_p == IPPROTO_TCP || ip->ip_p == IPPROTO_UDP)) {
 903                         adjust = FALSE;
 904                 } else {
 905                         m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
 906                         m->m_pkthdr.csum_data = 0;
 907                         ipstat.ips_adj_hwcsum_clr++;
 908                 }
 909         }
 910
 911         if (adjust) {
 912                 ipstat.ips_adj++;
 913                 if (m->m_len == m->m_pkthdr.len) {
 914                         m->m_len = ip->ip_len;
 915                         m->m_pkthdr.len = ip->ip_len;
 916                 } else {
 917                         m_adj(m, ip->ip_len - m->m_pkthdr.len);
 918                 }
 919         }
 920 }
 921
 922 /*
 923  * First pass does all essential packet validation and places on a per flow
 924  * queue for doing operations that have same outcome for all packets of a flow.
 925  * div_info is packet divert/tee info
 926  */
 927 static ipinput_chain_ret_t
 928 ip_input_first_pass(struct mbuf *m, u_int32_t *div_info,
 929     struct ip_fw_in_args *args, int *ours, struct mbuf **modm)
 930 {
 931         struct ip       *ip;
 932         struct ifnet    *inifp;
 933         unsigned int    hlen;
 934         int             retval = IPINPUT_DOCHAIN;
 935         int             len = 0;
 936         struct in_addr  src_ip;
 937 #if IPFIREWALL
 938         int             i;
 939 #endif
 940 #if IPFIREWALL || DUMMYNET
 941         struct m_tag            *copy;
 942         struct m_tag            *p;
 943         boolean_t               delete = FALSE;
 944         struct ip_fw_args       args1;
 945         boolean_t               init = FALSE;
 946 #endif
 947         ipfilter_t inject_filter_ref = NULL;
 948
 949 #if !IPFIREWALL
 950 #pragma unused (args)
 951 #endif
 952
 953 #if !IPDIVERT
 954 #pragma unused (div_info)
 955 #pragma unused (ours)
 956 #endif
 957
 958 #if !IPFIREWALL_FORWARD
 959 #pragma unused (ours)
 960 #endif
 961
 962         /* Check if the mbuf is still valid after interface filter processing */
 963         MBUF_INPUT_CHECK(m, m->m_pkthdr.rcvif);
 964         inifp = mbuf_pkthdr_rcvif(m);
 965         VERIFY(inifp != NULL);
 966
 967         /* Perform IP header alignment fixup, if needed */
 968         IP_HDR_ALIGNMENT_FIXUP(m, inifp, goto bad);
 969
 970         m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
 971
 972 #if IPFIREWALL || DUMMYNET
 973
 974         /*
 975          * Don't bother searching for tag(s) if there's none.
 976          */
 977         if (SLIST_EMPTY(&m->m_pkthdr.tags))
 978                 goto ipfw_tags_done;
 979
 980         /* Grab info from mtags prepended to the chain */
 981         p = m_tag_first(m);
 982         while (p) {
 983                 if (p->m_tag_id == KERNEL_MODULE_TAG_ID) {
 984 #if DUMMYNET
 985                         if (p->m_tag_type == KERNEL_TAG_TYPE_DUMMYNET) {
 986                                 struct dn_pkt_tag *dn_tag;
 987
 988                                 dn_tag = (struct dn_pkt_tag *)(p+1);
 989                                 args->fwai_ipfw_rule = dn_tag->dn_ipfw_rule;
 990                                 args->fwai_pf_rule = dn_tag->dn_pf_rule;
 991                                 delete = TRUE;
 992                         }
 993 #endif
 994
 995 #if IPDIVERT
 996                         if (p->m_tag_type == KERNEL_TAG_TYPE_DIVERT) {
 997                                 struct divert_tag *div_tag;
 998
 999                                 div_tag = (struct divert_tag *)(p+1);
1000                                 args->fwai_divert_rule = div_tag->cookie;
1001                                 delete = TRUE;
1002                         }
1003 #endif
1004
1005                         if (p->m_tag_type == KERNEL_TAG_TYPE_IPFORWARD) {
1006                                 struct ip_fwd_tag *ipfwd_tag;
1007
1008                                 ipfwd_tag = (struct ip_fwd_tag *)(p+1);
1009                                 args->fwai_next_hop = ipfwd_tag->next_hop;
1010                                 delete = TRUE;
1011                         }
1012
1013                         if (delete) {
1014                                 copy = p;
1015                                 p = m_tag_next(m, p);
1016                                 m_tag_delete(m, copy);
1017                         } else  {
1018                                 p = m_tag_next(m, p);
1019                         }
1020                 } else {
1021                         p = m_tag_next(m, p);
1022                 }
1023         }
1024
1025 #if DIAGNOSTIC
1026         if (m == NULL || !(m->m_flags & M_PKTHDR))
1027                 panic("ip_input no HDR");
1028 #endif
1029
1030 #if DUMMYNET
1031         if (args->fwai_ipfw_rule || args->fwai_pf_rule) {
1032                 /* dummynet already filtered us */
1033                 ip = mtod(m, struct ip *);
1034                 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1035                 inject_filter_ref = ipf_get_inject_filter(m);
1036 #if IPFIREWALL
1037                 if (args->fwai_ipfw_rule)
1038                         goto iphack;
1039 #endif /* IPFIREWALL */
1040                 if (args->fwai_pf_rule)
1041                         goto check_with_pf;
1042         }
1043 #endif /* DUMMYNET */
1044 ipfw_tags_done:
1045 #endif /* IPFIREWALL || DUMMYNET */
1046
1047         /*
1048          * No need to process packet twice if we've already seen it.
1049          */
1050         if (!SLIST_EMPTY(&m->m_pkthdr.tags))
1051                 inject_filter_ref = ipf_get_inject_filter(m);
1052         if (inject_filter_ref != NULL) {
1053                 ip = mtod(m, struct ip *);
1054                 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1055
1056                 DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL,
1057                     struct ip *, ip, struct ifnet *, inifp,
1058                     struct ip *, ip, struct ip6_hdr *, NULL);
1059
1060                 ip->ip_len = ntohs(ip->ip_len) - hlen;
1061                 ip->ip_off = ntohs(ip->ip_off);
1062                 ip_proto_dispatch_in(m, hlen, ip->ip_p, inject_filter_ref);
1063                 return (IPINPUT_DONE);
1064         }
1065
1066         if (m->m_pkthdr.len < sizeof (struct ip)) {
1067                 OSAddAtomic(1, &ipstat.ips_total);
1068                 OSAddAtomic(1, &ipstat.ips_tooshort);
1069                 m_freem(m);
1070                 return (IPINPUT_FREED);
1071         }
1072
1073         if (m->m_len < sizeof (struct ip) &&
1074             (m = m_pullup(m, sizeof (struct ip))) == NULL) {
1075                 OSAddAtomic(1, &ipstat.ips_total);
1076                 OSAddAtomic(1, &ipstat.ips_toosmall);
1077                 return (IPINPUT_FREED);
1078         }
1079
1080         ip = mtod(m, struct ip *);
1081         *modm = m;
1082
1083         KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr, ip->ip_src.s_addr,
1084             ip->ip_p, ip->ip_off, ip->ip_len);
1085
1086         if (IP_VHL_V(ip->ip_vhl) != IPVERSION) {
1087                 OSAddAtomic(1, &ipstat.ips_total);
1088                 OSAddAtomic(1, &ipstat.ips_badvers);
1089                 KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1090                 m_freem(m);
1091                 return (IPINPUT_FREED);
1092         }
1093
1094         hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1095         if (hlen < sizeof (struct ip)) {
1096                 OSAddAtomic(1, &ipstat.ips_total);
1097                 OSAddAtomic(1, &ipstat.ips_badhlen);
1098                 KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1099                 m_freem(m);
1100                 return (IPINPUT_FREED);
1101         }
1102
1103         if (hlen > m->m_len) {
1104                 if ((m = m_pullup(m, hlen)) == NULL) {
1105                         OSAddAtomic(1, &ipstat.ips_total);
1106                         OSAddAtomic(1, &ipstat.ips_badhlen);
1107                         KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1108                         return (IPINPUT_FREED);
1109                 }
1110                 ip = mtod(m, struct ip *);
1111                 *modm = m;
1112         }
1113
1114         /* 127/8 must not appear on wire - RFC1122 */
1115         if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
1116             (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
1117                 /*
1118                  * Allow for the following exceptions:
1119                  *
1120                  *   1. If the packet was sent to loopback (i.e. rcvif
1121                  *      would have been set earlier at output time.)
1122                  *
1123                  *   2. If the packet was sent out on loopback from a local
1124                  *      source address which belongs to a non-loopback
1125                  *      interface (i.e. rcvif may not necessarily be a
1126                  *      loopback interface, hence the test for PKTF_LOOP.)
1127                  *      Unlike IPv6, there is no interface scope ID, and
1128                  *      therefore we don't care so much about PKTF_IFINFO.
1129                  */
1130                 if (!(inifp->if_flags & IFF_LOOPBACK) &&
1131                      !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
1132                         OSAddAtomic(1, &ipstat.ips_total);
1133                         OSAddAtomic(1, &ipstat.ips_badaddr);
1134                         KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1135                         m_freem(m);
1136                         return (IPINPUT_FREED);
1137                 }
1138         }
1139
1140         /* IPv4 Link-Local Addresses as defined in RFC3927 */
1141         if ((IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) ||
1142             IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)))) {
1143                 ip_linklocal_stat.iplls_in_total++;
1144                 if (ip->ip_ttl != MAXTTL) {
1145                         OSAddAtomic(1, &ip_linklocal_stat.iplls_in_badttl);
1146                         /* Silently drop link local traffic with bad TTL */
1147                         if (!ip_linklocal_in_allowbadttl) {
1148                                 OSAddAtomic(1, &ipstat.ips_total);
1149                                 KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1150                                 m_freem(m);
1151                                 return (IPINPUT_FREED);
1152                         }
1153                 }
1154         }
1155
1156         if (ip_cksum(m, hlen)) {
1157                 OSAddAtomic(1, &ipstat.ips_total);
1158                 KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1159                 m_freem(m);
1160                 return (IPINPUT_FREED);
1161         }
1162
1163         DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL,
1164             struct ip *, ip, struct ifnet *, inifp,
1165             struct ip *, ip, struct ip6_hdr *, NULL);
1166
1167         /*
1168          * Convert fields to host representation.
1169          */
1170 #if BYTE_ORDER != BIG_ENDIAN
1171         NTOHS(ip->ip_len);
1172 #endif
1173
1174         if (ip->ip_len < hlen) {
1175                 OSAddAtomic(1, &ipstat.ips_total);
1176                 OSAddAtomic(1, &ipstat.ips_badlen);
1177                 KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1178                 m_freem(m);
1179                 return (IPINPUT_FREED);
1180         }
1181
1182 #if BYTE_ORDER != BIG_ENDIAN
1183         NTOHS(ip->ip_off);
1184 #endif
1185
1186         /*
1187          * Check that the amount of data in the buffers
1188          * is as at least much as the IP header would have us expect.
1189          * Trim mbufs if longer than we expect.
1190          * Drop packet if shorter than we expect.
1191          */
1192         if (m->m_pkthdr.len < ip->ip_len) {
1193                 OSAddAtomic(1, &ipstat.ips_total);
1194                 OSAddAtomic(1, &ipstat.ips_tooshort);
1195                 KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1196                 m_freem(m);
1197                 return (IPINPUT_FREED);
1198         }
1199
1200         if (m->m_pkthdr.len > ip->ip_len) {
1201                 ip_input_adjust(m, ip, inifp);
1202         }
1203
1204         /* for consistency */
1205         m->m_pkthdr.pkt_proto = ip->ip_p;
1206
1207         /* for netstat route statistics */
1208         src_ip = ip->ip_src;
1209         len = m->m_pkthdr.len;
1210
1211 #if DUMMYNET
1212 check_with_pf:
1213 #endif
1214 #if PF
1215         /* Invoke inbound packet filter */
1216         if (PF_IS_ENABLED) {
1217                 int error;
1218                 ip_input_cpout_args(args, &args1, &init);
1219                 ip = mtod(m, struct ip *);
1220                 src_ip = ip->ip_src;
1221
1222 #if DUMMYNET
1223                 error = pf_af_hook(inifp, NULL, &m, AF_INET, TRUE, &args1);
1224 #else
1225                 error = pf_af_hook(inifp, NULL, &m, AF_INET, TRUE, NULL);
1226 #endif /* DUMMYNET */
1227                 if (error != 0 || m == NULL) {
1228                         if (m != NULL) {
1229                                 panic("%s: unexpected packet %p\n",
1230                                     __func__, m);
1231                                 /* NOTREACHED */
1232                         }
1233                         /* Already freed by callee */
1234                         ip_input_update_nstat(inifp, src_ip, 1, len);
1235                         KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1236                         OSAddAtomic(1, &ipstat.ips_total);
1237                         return (IPINPUT_FREED);
1238                 }
1239                 ip = mtod(m, struct ip *);
1240                 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1241                 *modm = m;
1242                 ip_input_cpin_args(&args1, args);
1243         }
1244 #endif /* PF */
1245
1246 #if IPSEC
1247         if (ipsec_bypass == 0 && ipsec_gethist(m, NULL)) {
1248                 retval = IPINPUT_DONTCHAIN; /* XXX scope for chaining here? */
1249                 goto pass;
1250         }
1251 #endif
1252
1253 #if IPFIREWALL
1254 #if DUMMYNET
1255 iphack:
1256 #endif /* DUMMYNET */
1257         /*
1258          * Check if we want to allow this packet to be processed.
1259          * Consider it to be bad if not.
1260          */
1261         if (fw_enable && IPFW_LOADED) {
1262 #if IPFIREWALL_FORWARD
1263                 /*
1264                  * If we've been forwarded from the output side, then
1265                  * skip the firewall a second time
1266                  */
1267                 if (args->fwai_next_hop) {
1268                         *ours = 1;
1269                         return (IPINPUT_DONTCHAIN);
1270                 }
1271 #endif  /* IPFIREWALL_FORWARD */
1272                 ip_input_cpout_args(args, &args1, &init);
1273                 args1.fwa_m = m;
1274
1275                 i = ip_fw_chk_ptr(&args1);
1276                 m = args1.fwa_m;
1277
1278                 if ((i & IP_FW_PORT_DENY_FLAG) || m == NULL) { /* drop */
1279                         if (m)
1280                                 m_freem(m);
1281                         ip_input_update_nstat(inifp, src_ip, 1, len);
1282                         KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1283                         OSAddAtomic(1, &ipstat.ips_total);
1284                         return (IPINPUT_FREED);
1285                 }
1286                 ip = mtod(m, struct ip *); /* just in case m changed */
1287                 *modm = m;
1288                 ip_input_cpin_args(&args1, args);
1289
1290                 if (i == 0 && args->fwai_next_hop == NULL) { /* common case */
1291                         goto pass;
1292                 }
1293 #if DUMMYNET
1294                 if (DUMMYNET_LOADED && (i & IP_FW_PORT_DYNT_FLAG) != 0) {
1295                         /* Send packet to the appropriate pipe */
1296                         ip_dn_io_ptr(m, i&0xffff, DN_TO_IP_IN, &args1,
1297                             DN_CLIENT_IPFW);
1298                         ip_input_update_nstat(inifp, src_ip, 1, len);
1299                         KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1300                         OSAddAtomic(1, &ipstat.ips_total);
1301                         return (IPINPUT_FREED);
1302                 }
1303 #endif /* DUMMYNET */
1304 #if IPDIVERT
1305                 if (i != 0 && (i & IP_FW_PORT_DYNT_FLAG) == 0) {
1306                         /* Divert or tee packet */
1307                         *div_info = i;
1308                         *ours = 1;
1309                         return (IPINPUT_DONTCHAIN);
1310                 }
1311 #endif
1312 #if IPFIREWALL_FORWARD
1313                 if (i == 0 && args->fwai_next_hop != NULL) {
1314                         retval = IPINPUT_DONTCHAIN;
1315                         goto pass;
1316                 }
1317 #endif
1318                 /*
1319                  * if we get here, the packet must be dropped
1320                  */
1321                 ip_input_update_nstat(inifp, src_ip, 1, len);
1322                 KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1323                 m_freem(m);
1324                 OSAddAtomic(1, &ipstat.ips_total);
1325                 return (IPINPUT_FREED);
1326         }
1327 #endif /* IPFIREWALL */
1328 #if IPSEC | IPFIREWALL
1329 pass:
1330 #endif
1331         /*
1332          * Process options and, if not destined for us,
1333          * ship it on.  ip_dooptions returns 1 when an
1334          * error was detected (causing an icmp message
1335          * to be sent and the original packet to be freed).
1336          */
1337         ip_nhops = 0;           /* for source routed packets */
1338 #if IPFIREWALL
1339         if (hlen > sizeof (struct ip) &&
1340             ip_dooptions(m, 0, args->fwai_next_hop)) {
1341 #else /* !IPFIREWALL */
1342         if (hlen > sizeof (struct ip) && ip_dooptions(m, 0, NULL)) {
1343 #endif /* !IPFIREWALL */
1344                 ip_input_update_nstat(inifp, src_ip, 1, len);
1345                 KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1346                 OSAddAtomic(1, &ipstat.ips_total);
1347                 return (IPINPUT_FREED);
1348         }
1349
1350         /*
1351          * Don't chain fragmented packets as the process of determining
1352          * if it is our fragment or someone else's plus the complexity of
1353          * divert and fw args makes it harder to do chaining.
1354          */
1355         if (ip->ip_off & ~(IP_DF | IP_RF))
1356                 return (IPINPUT_DONTCHAIN);
1357
1358         /* Allow DHCP/BootP responses through */
1359         if ((inifp->if_eflags & IFEF_AUTOCONFIGURING) &&
1360             hlen == sizeof (struct ip) && ip->ip_p == IPPROTO_UDP) {
1361                 struct udpiphdr *ui;
1362
1363                 if (m->m_len < sizeof (struct udpiphdr) &&
1364                     (m = m_pullup(m, sizeof (struct udpiphdr))) == NULL) {
1365                         OSAddAtomic(1, &udpstat.udps_hdrops);
1366                         KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1367                         OSAddAtomic(1, &ipstat.ips_total);
1368                         return (IPINPUT_FREED);
1369                 }
1370                 *modm = m;
1371                 ui = mtod(m, struct udpiphdr *);
1372                 if (ntohs(ui->ui_dport) == IPPORT_BOOTPC) {
1373                         ip_setdstifaddr_info(m, inifp->if_index, NULL);
1374                         return (IPINPUT_DONTCHAIN);
1375                 }
1376         }
1377
1378         /* Avoid chaining raw sockets as ipsec checks occur later for them */
1379         if (ip_protox[ip->ip_p]->pr_flags & PR_LASTHDR)
1380                 return (IPINPUT_DONTCHAIN);
1381
1382         return (retval);
1383 #if !defined(__i386__) && !defined(__x86_64__)
1384 bad:
1385         m_freem(m);
1386         return (IPINPUT_FREED);
1387 #endif
1388 }
1389
1390 static void
1391 ip_input_second_pass(struct mbuf *m, struct ifnet *inifp, u_int32_t div_info,
1392     int npkts_in_chain, int bytes_in_chain, struct ip_fw_in_args *args, int ours)
1393 {
1394         unsigned int            checkif;
1395         struct mbuf             *tmp_mbuf = NULL;
1396         struct in_ifaddr        *ia = NULL;
1397         struct in_addr          pkt_dst;
1398         unsigned int            hlen;
1399
1400 #if !IPFIREWALL
1401 #pragma unused (args)
1402 #endif
1403
1404 #if !IPDIVERT
1405 #pragma unused (div_info)
1406 #endif
1407
1408         struct ip *ip = mtod(m, struct ip *);
1409         hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1410
1411         OSAddAtomic(npkts_in_chain, &ipstat.ips_total);
1412
1413         /*
1414          * Naively assume we can attribute inbound data to the route we would
1415          * use to send to this destination. Asymmetric routing breaks this
1416          * assumption, but it still allows us to account for traffic from
1417          * a remote node in the routing table.
1418          * this has a very significant performance impact so we bypass
1419          * if nstat_collect is disabled. We may also bypass if the
1420          * protocol is tcp in the future because tcp will have a route that
1421          * we can use to attribute the data to. That does mean we would not
1422          * account for forwarded tcp traffic.
1423          */
1424         ip_input_update_nstat(inifp, ip->ip_src, npkts_in_chain,
1425             bytes_in_chain);
1426
1427         if (ours)
1428                 goto ours;
1429
1430         /*
1431          * Check our list of addresses, to see if the packet is for us.
1432          * If we don't have any addresses, assume any unicast packet
1433          * we receive might be for us (and let the upper layers deal
1434          * with it).
1435          */
1436         tmp_mbuf = m;
1437         if (TAILQ_EMPTY(&in_ifaddrhead)) {
1438                 while (tmp_mbuf) {
1439                         if (!(tmp_mbuf->m_flags & (M_MCAST|M_BCAST))) {
1440                                 ip_setdstifaddr_info(tmp_mbuf, inifp->if_index,
1441                                     NULL);
1442                         }
1443                         tmp_mbuf = mbuf_nextpkt(tmp_mbuf);
1444                 }
1445                 goto ours;
1446         }
1447         /*
1448          * Cache the destination address of the packet; this may be
1449          * changed by use of 'ipfw fwd'.
1450          */
1451 #if IPFIREWALL
1452         pkt_dst = args->fwai_next_hop == NULL ?
1453             ip->ip_dst : args->fwai_next_hop->sin_addr;
1454 #else /* !IPFIREWALL */
1455         pkt_dst = ip->ip_dst;
1456 #endif /* !IPFIREWALL */
1457
1458         /*
1459          * Enable a consistency check between the destination address
1460          * and the arrival interface for a unicast packet (the RFC 1122
1461          * strong ES model) if IP forwarding is disabled and the packet
1462          * is not locally generated and the packet is not subject to
1463          * 'ipfw fwd'.
1464          *
1465          * XXX - Checking also should be disabled if the destination
1466          * address is ipnat'ed to a different interface.
1467          *
1468          * XXX - Checking is incompatible with IP aliases added
1469          * to the loopback interface instead of the interface where
1470          * the packets are received.
1471          */
1472         checkif = ip_checkinterface && (ipforwarding == 0) &&
1473             !(inifp->if_flags & IFF_LOOPBACK) &&
1474             !(m->m_pkthdr.pkt_flags & PKTF_LOOP)
1475 #if IPFIREWALL
1476             && (args->fwai_next_hop == NULL);
1477 #else /* !IPFIREWALL */
1478                 ;
1479 #endif /* !IPFIREWALL */
1480
1481         /*
1482          * Check for exact addresses in the hash bucket.
1483          */
1484         lck_rw_lock_shared(in_ifaddr_rwlock);
1485         TAILQ_FOREACH(ia, INADDR_HASH(pkt_dst.s_addr), ia_hash) {
1486                 /*
1487                  * If the address matches, verify that the packet
1488                  * arrived via the correct interface if checking is
1489                  * enabled.
1490                  */
1491                 if (IA_SIN(ia)->sin_addr.s_addr == pkt_dst.s_addr &&
1492                     (!checkif || ia->ia_ifp == inifp)) {
1493                         ip_input_setdst_chain(m, 0, ia);
1494                         lck_rw_done(in_ifaddr_rwlock);
1495                         goto ours;
1496                 }
1497         }
1498         lck_rw_done(in_ifaddr_rwlock);
1499
1500         /*
1501          * Check for broadcast addresses.
1502          *
1503          * Only accept broadcast packets that arrive via the matching
1504          * interface.  Reception of forwarded directed broadcasts would be
1505          * handled via ip_forward() and ether_frameout() with the loopback
1506          * into the stack for SIMPLEX interfaces handled by ether_frameout().
1507          */
1508         if (inifp->if_flags & IFF_BROADCAST) {
1509                 struct ifaddr *ifa;
1510
1511                 ifnet_lock_shared(inifp);
1512                 TAILQ_FOREACH(ifa, &inifp->if_addrhead, ifa_link) {
1513                         if (ifa->ifa_addr->sa_family != AF_INET) {
1514                                 continue;
1515                         }
1516                         ia = ifatoia(ifa);
1517                         if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr ==
1518                             pkt_dst.s_addr || ia->ia_netbroadcast.s_addr ==
1519                             pkt_dst.s_addr) {
1520                                 ip_input_setdst_chain(m, 0, ia);
1521                                 ifnet_lock_done(inifp);
1522                                 goto ours;
1523                         }
1524                 }
1525                 ifnet_lock_done(inifp);
1526         }
1527
1528         if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
1529                 struct in_multi *inm;
1530                 /*
1531                  * See if we belong to the destination multicast group on the
1532                  * arrival interface.
1533                  */
1534                 in_multihead_lock_shared();
1535                 IN_LOOKUP_MULTI(&ip->ip_dst, inifp, inm);
1536                 in_multihead_lock_done();
1537                 if (inm == NULL) {
1538                         OSAddAtomic(npkts_in_chain, &ipstat.ips_notmember);
1539                         m_freem_list(m);
1540                         KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1541                         return;
1542                 }
1543                 ip_input_setdst_chain(m, inifp->if_index, NULL);
1544                 INM_REMREF(inm);
1545                 goto ours;
1546         }
1547
1548         if (ip->ip_dst.s_addr == (u_int32_t)INADDR_BROADCAST ||
1549             ip->ip_dst.s_addr == INADDR_ANY) {
1550                 ip_input_setdst_chain(m, inifp->if_index, NULL);
1551                 goto ours;
1552         }
1553
1554         if (ip->ip_p == IPPROTO_UDP) {
1555                 struct udpiphdr *ui;
1556                 ui = mtod(m, struct udpiphdr *);
1557                 if (ntohs(ui->ui_dport) == IPPORT_BOOTPC) {
1558                         goto ours;
1559                 }
1560         }
1561
1562         tmp_mbuf = m;
1563         struct mbuf *nxt_mbuf = NULL;
1564         while (tmp_mbuf) {
1565                 nxt_mbuf = mbuf_nextpkt(tmp_mbuf);
1566                 /*
1567                  * Not for us; forward if possible and desirable.
1568                  */
1569                 mbuf_setnextpkt(tmp_mbuf, NULL);
1570                 if (ipforwarding == 0) {
1571                         OSAddAtomic(1, &ipstat.ips_cantforward);
1572                         m_freem(tmp_mbuf);
1573                 } else {
1574 #if IPFIREWALL
1575                         ip_forward(tmp_mbuf, 0, args->fwai_next_hop);
1576 #else
1577                         ip_forward(tmp_mbuf, 0, NULL);
1578 #endif
1579                 }
1580                 tmp_mbuf = nxt_mbuf;
1581         }
1582         KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1583         return;
1584 ours:
1585         /*
1586          * If offset or IP_MF are set, must reassemble.
1587          */
1588         if (ip->ip_off & ~(IP_DF | IP_RF)) {
1589                 VERIFY(npkts_in_chain == 1);
1590                 /*
1591                  * ip_reass() will return a different mbuf, and update
1592                  * the divert info in div_info and args->fwai_divert_rule.
1593                  */
1594 #if IPDIVERT
1595                 m = ip_reass(m, (u_int16_t *)&div_info, &args->fwai_divert_rule);
1596 #else
1597                 m = ip_reass(m);
1598 #endif
1599                 if (m == NULL)
1600                         return;
1601                 ip = mtod(m, struct ip *);
1602                 /* Get the header length of the reassembled packet */
1603                 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1604 #if IPDIVERT
1605                 /* Restore original checksum before diverting packet */
1606                 if (div_info != 0) {
1607                         VERIFY(npkts_in_chain == 1);
1608 #if BYTE_ORDER != BIG_ENDIAN
1609                         HTONS(ip->ip_len);
1610                         HTONS(ip->ip_off);
1611 #endif
1612                         ip->ip_sum = 0;
1613                         ip->ip_sum = ip_cksum_hdr_in(m, hlen);
1614 #if BYTE_ORDER != BIG_ENDIAN
1615                         NTOHS(ip->ip_off);
1616                         NTOHS(ip->ip_len);
1617 #endif
1618                 }
1619 #endif
1620         }
1621
1622         /*
1623          * Further protocols expect the packet length to be w/o the
1624          * IP header.
1625          */
1626         ip->ip_len -= hlen;
1627
1628 #if IPDIVERT
1629         /*
1630          * Divert or tee packet to the divert protocol if required.
1631          *
1632          * If div_info is zero then cookie should be too, so we shouldn't
1633          * need to clear them here.  Assume divert_packet() does so also.
1634          */
1635         if (div_info != 0) {
1636                 struct mbuf *clone = NULL;
1637                 VERIFY(npkts_in_chain == 1);
1638
1639                 /* Clone packet if we're doing a 'tee' */
1640                 if (div_info & IP_FW_PORT_TEE_FLAG)
1641                         clone = m_dup(m, M_DONTWAIT);
1642
1643                 /* Restore packet header fields to original values */
1644                 ip->ip_len += hlen;
1645
1646 #if BYTE_ORDER != BIG_ENDIAN
1647                 HTONS(ip->ip_len);
1648                 HTONS(ip->ip_off);
1649 #endif
1650                 /* Deliver packet to divert input routine */
1651                 OSAddAtomic(1, &ipstat.ips_delivered);
1652                 divert_packet(m, 1, div_info & 0xffff, args->fwai_divert_rule);
1653
1654                 /* If 'tee', continue with original packet */
1655                 if (clone == NULL) {
1656                         return;
1657                 }
1658                 m = clone;
1659                 ip = mtod(m, struct ip *);
1660         }
1661 #endif
1662
1663 #if IPSEC
1664         /*
1665          * enforce IPsec policy checking if we are seeing last header.
1666          * note that we do not visit this with protocols with pcb layer
1667          * code - like udp/tcp/raw ip.
1668          */
1669         if (ipsec_bypass == 0 && (ip_protox[ip->ip_p]->pr_flags & PR_LASTHDR)) {
1670                 VERIFY(npkts_in_chain == 1);
1671                 if (ipsec4_in_reject(m, NULL)) {
1672                         IPSEC_STAT_INCREMENT(ipsecstat.in_polvio);
1673                         goto bad;
1674                 }
1675         }
1676 #endif /* IPSEC */
1677
1678         /*
1679          * Switch out to protocol's input routine.
1680          */
1681         OSAddAtomic(npkts_in_chain, &ipstat.ips_delivered);
1682
1683 #if IPFIREWALL
1684         if (args->fwai_next_hop && ip->ip_p == IPPROTO_TCP) {
1685                 /* TCP needs IPFORWARD info if available */
1686                 struct m_tag *fwd_tag;
1687                 struct ip_fwd_tag *ipfwd_tag;
1688
1689                 VERIFY(npkts_in_chain == 1);
1690                 fwd_tag = m_tag_create(KERNEL_MODULE_TAG_ID,
1691                     KERNEL_TAG_TYPE_IPFORWARD, sizeof (*ipfwd_tag),
1692                     M_NOWAIT, m);
1693                 if (fwd_tag == NULL)
1694                         goto bad;
1695
1696                 ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag+1);
1697                 ipfwd_tag->next_hop = args->fwai_next_hop;
1698
1699                 m_tag_prepend(m, fwd_tag);
1700
1701                 KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr,
1702                     ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len);
1703
1704                 /* TCP deals with its own locking */
1705                 ip_proto_dispatch_in(m, hlen, ip->ip_p, 0);
1706         } else {
1707                 KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr,
1708                     ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len);
1709
1710                 ip_input_dispatch_chain(m);
1711
1712         }
1713 #else /* !IPFIREWALL */
1714         ip_input_dispatch_chain(m);
1715
1716 #endif /* !IPFIREWALL */
1717         KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1718         return;
1719 bad:
1720         KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1721         m_freem(m);
1722 }
1723
1724 void
1725 ip_input_process_list(struct mbuf *packet_list)
1726 {
1727         pktchain_elm_t  pktchain_tbl[PKTTBL_SZ];
1728
1729         struct mbuf     *packet = NULL;
1730         struct mbuf     *modm = NULL; /* modified mbuf */
1731         int             retval = 0;
1732         u_int32_t       div_info = 0;
1733         int             ours = 0;
1734 #if (DEBUG || DEVELOPMENT)
1735         struct timeval start_tv;
1736 #endif /* (DEBUG || DEVELOPMENT) */
1737         int     num_pkts = 0;
1738         int chain = 0;
1739         struct ip_fw_in_args       args;
1740
1741         if (ip_chaining == 0) {
1742                 struct mbuf *m = packet_list;
1743 #if (DEBUG || DEVELOPMENT)
1744                 if (ip_input_measure)
1745                         net_perf_start_time(&net_perf, &start_tv);
1746 #endif /* (DEBUG || DEVELOPMENT) */
1747
1748                 while (m) {
1749                         packet_list = mbuf_nextpkt(m);
1750                         mbuf_setnextpkt(m, NULL);
1751                         ip_input(m);
1752                         m = packet_list;
1753                         num_pkts++;
1754                 }
1755 #if (DEBUG || DEVELOPMENT)
1756                 if (ip_input_measure)
1757                         net_perf_measure_time(&net_perf, &start_tv, num_pkts);
1758 #endif /* (DEBUG || DEVELOPMENT) */
1759                 return;
1760         }
1761 #if (DEBUG || DEVELOPMENT)
1762         if (ip_input_measure)
1763                 net_perf_start_time(&net_perf, &start_tv);
1764 #endif /* (DEBUG || DEVELOPMENT) */
1765
1766         bzero(&pktchain_tbl, sizeof(pktchain_tbl));
1767 restart_list_process:
1768         chain = 0;
1769         for (packet = packet_list; packet; packet = packet_list) {
1770                 packet_list = mbuf_nextpkt(packet);
1771                 mbuf_setnextpkt(packet, NULL);
1772
1773                 num_pkts++;
1774                 modm = NULL;
1775                 div_info = 0;
1776                 bzero(&args, sizeof (args));
1777
1778                 retval = ip_input_first_pass(packet, &div_info, &args,
1779                     &ours, &modm);
1780
1781                 if (retval == IPINPUT_DOCHAIN) {
1782                         if (modm)
1783                                 packet = modm;
1784                         packet = ip_chain_insert(packet, &pktchain_tbl[0]);
1785                         if (packet == NULL) {
1786                                 ipstat.ips_rxc_chained++;
1787                                 chain++;
1788                                 if (chain > ip_chainsz)
1789                                         break;
1790                         } else {
1791                                 ipstat.ips_rxc_collisions++;
1792                                 break;
1793                         }
1794                 } else if (retval == IPINPUT_DONTCHAIN) {
1795                         /* in order to preserve order, exit from chaining */
1796                         if (modm)
1797                                 packet = modm;
1798                         ipstat.ips_rxc_notchain++;
1799                         break;
1800                 } else {
1801                         /* packet was freed or delivered, do nothing. */
1802                 }
1803         }
1804
1805         /* do second pass here for pktchain_tbl */
1806         if (chain)
1807                 ip_input_second_pass_loop_tbl(&pktchain_tbl[0], &args);
1808
1809         if (packet) {
1810                 /*
1811                  * equivalent update in chaining case if performed in
1812                  * ip_input_second_pass_loop_tbl().
1813                  */
1814 #if (DEBUG || DEVELOPMENT)
1815                 if (ip_input_measure)
1816                         net_perf_histogram(&net_perf, 1);
1817 #endif /* (DEBUG || DEVELOPMENT) */
1818                 ip_input_second_pass(packet, packet->m_pkthdr.rcvif, div_info,
1819                     1, packet->m_pkthdr.len, &args, ours);
1820         }
1821
1822         if (packet_list)
1823                 goto restart_list_process;
1824
1825 #if (DEBUG || DEVELOPMENT)
1826         if (ip_input_measure)
1827                 net_perf_measure_time(&net_perf, &start_tv, num_pkts);
1828 #endif /* (DEBUG || DEVELOPMENT) */
1829 }
1830 /*
1831  * Ip input routine.  Checksum and byte swap header.  If fragmented
1832  * try to reassemble.  Process options.  Pass to next level.
1833  */
1834 void
1835 ip_input(struct mbuf *m)
1836 {
1837         struct ip *ip;
1838         struct in_ifaddr *ia = NULL;
1839         unsigned int hlen, checkif;
1840         u_short sum = 0;
1841         struct in_addr pkt_dst;
1842 #if IPFIREWALL
1843         int i;
1844         u_int32_t div_info = 0;         /* packet divert/tee info */
1845 #endif
1846 #if IPFIREWALL || DUMMYNET
1847         struct ip_fw_args args;
1848         struct m_tag    *tag;
1849 #endif
1850         ipfilter_t inject_filter_ref = NULL;
1851         struct ifnet *inifp;
1852
1853         /* Check if the mbuf is still valid after interface filter processing */
1854         MBUF_INPUT_CHECK(m, m->m_pkthdr.rcvif);
1855         inifp = m->m_pkthdr.rcvif;
1856         VERIFY(inifp != NULL);
1857
1858         ipstat.ips_rxc_notlist++;
1859
1860         /* Perform IP header alignment fixup, if needed */
1861         IP_HDR_ALIGNMENT_FIXUP(m, inifp, goto bad);
1862
1863         m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
1864
1865 #if IPFIREWALL || DUMMYNET
1866         bzero(&args, sizeof (struct ip_fw_args));
1867
1868         /*
1869          * Don't bother searching for tag(s) if there's none.
1870          */
1871         if (SLIST_EMPTY(&m->m_pkthdr.tags))
1872                 goto ipfw_tags_done;
1873
1874         /* Grab info from mtags prepended to the chain */
1875 #if DUMMYNET
1876         if ((tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID,
1877             KERNEL_TAG_TYPE_DUMMYNET, NULL)) != NULL) {
1878                 struct dn_pkt_tag *dn_tag;
1879
1880                 dn_tag = (struct dn_pkt_tag *)(tag+1);
1881                 args.fwa_ipfw_rule = dn_tag->dn_ipfw_rule;
1882                 args.fwa_pf_rule = dn_tag->dn_pf_rule;
1883
1884                 m_tag_delete(m, tag);
1885         }
1886 #endif /* DUMMYNET */
1887
1888 #if IPDIVERT
1889         if ((tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID,
1890             KERNEL_TAG_TYPE_DIVERT, NULL)) != NULL) {
1891                 struct divert_tag *div_tag;
1892
1893                 div_tag = (struct divert_tag *)(tag+1);
1894                 args.fwa_divert_rule = div_tag->cookie;
1895
1896                 m_tag_delete(m, tag);
1897         }
1898 #endif
1899
1900         if ((tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID,
1901             KERNEL_TAG_TYPE_IPFORWARD, NULL)) != NULL) {
1902                 struct ip_fwd_tag *ipfwd_tag;
1903
1904                 ipfwd_tag = (struct ip_fwd_tag *)(tag+1);
1905                 args.fwa_next_hop = ipfwd_tag->next_hop;
1906
1907                 m_tag_delete(m, tag);
1908         }
1909
1910 #if     DIAGNOSTIC
1911         if (m == NULL || !(m->m_flags & M_PKTHDR))
1912                 panic("ip_input no HDR");
1913 #endif
1914
1915 #if DUMMYNET
1916         if (args.fwa_ipfw_rule || args.fwa_pf_rule) {
1917                 /* dummynet already filtered us */
1918                 ip = mtod(m, struct ip *);
1919                 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1920                 inject_filter_ref = ipf_get_inject_filter(m);
1921 #if IPFIREWALL
1922                 if (args.fwa_ipfw_rule)
1923                         goto iphack;
1924 #endif /* IPFIREWALL */
1925                 if (args.fwa_pf_rule)
1926                         goto check_with_pf;
1927         }
1928 #endif /* DUMMYNET */
1929 ipfw_tags_done:
1930 #endif /* IPFIREWALL || DUMMYNET */
1931
1932         /*
1933          * No need to process packet twice if we've already seen it.
1934          */
1935         if (!SLIST_EMPTY(&m->m_pkthdr.tags))
1936                 inject_filter_ref = ipf_get_inject_filter(m);
1937         if (inject_filter_ref != NULL) {
1938                 ip = mtod(m, struct ip *);
1939                 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1940
1941                 DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL,
1942                     struct ip *, ip, struct ifnet *, inifp,
1943                     struct ip *, ip, struct ip6_hdr *, NULL);
1944
1945                 ip->ip_len = ntohs(ip->ip_len) - hlen;
1946                 ip->ip_off = ntohs(ip->ip_off);
1947                 ip_proto_dispatch_in(m, hlen, ip->ip_p, inject_filter_ref);
1948                 return;
1949         }
1950
1951         OSAddAtomic(1, &ipstat.ips_total);
1952         if (m->m_pkthdr.len < sizeof (struct ip))
1953                 goto tooshort;
1954
1955         if (m->m_len < sizeof (struct ip) &&
1956             (m = m_pullup(m, sizeof (struct ip))) == NULL) {
1957                 OSAddAtomic(1, &ipstat.ips_toosmall);
1958                 return;
1959         }
1960         ip = mtod(m, struct ip *);
1961
1962         KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr, ip->ip_src.s_addr,
1963             ip->ip_p, ip->ip_off, ip->ip_len);
1964
1965         if (IP_VHL_V(ip->ip_vhl) != IPVERSION) {
1966                 OSAddAtomic(1, &ipstat.ips_badvers);
1967                 goto bad;
1968         }
1969
1970         hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1971         if (hlen < sizeof (struct ip)) {        /* minimum header length */
1972                 OSAddAtomic(1, &ipstat.ips_badhlen);
1973                 goto bad;
1974         }
1975         if (hlen > m->m_len) {
1976                 if ((m = m_pullup(m, hlen)) == NULL) {
1977                         OSAddAtomic(1, &ipstat.ips_badhlen);
1978                         return;
1979                 }
1980                 ip = mtod(m, struct ip *);
1981         }
1982
1983         /* 127/8 must not appear on wire - RFC1122 */
1984         if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
1985             (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
1986                 /*
1987                  * Allow for the following exceptions:
1988                  *
1989                  *   1. If the packet was sent to loopback (i.e. rcvif
1990                  *      would have been set earlier at output time.)
1991                  *
1992                  *   2. If the packet was sent out on loopback from a local
1993                  *      source address which belongs to a non-loopback
1994                  *      interface (i.e. rcvif may not necessarily be a
1995                  *      loopback interface, hence the test for PKTF_LOOP.)
1996                  *      Unlike IPv6, there is no interface scope ID, and
1997                  *      therefore we don't care so much about PKTF_IFINFO.
1998                  */
1999                 if (!(inifp->if_flags & IFF_LOOPBACK) &&
2000                     !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
2001                         OSAddAtomic(1, &ipstat.ips_badaddr);
2002                         goto bad;
2003                 }
2004         }
2005
2006         /* IPv4 Link-Local Addresses as defined in RFC3927 */
2007         if ((IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) ||
2008             IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)))) {
2009                 ip_linklocal_stat.iplls_in_total++;
2010                 if (ip->ip_ttl != MAXTTL) {
2011                         OSAddAtomic(1, &ip_linklocal_stat.iplls_in_badttl);
2012                         /* Silently drop link local traffic with bad TTL */
2013                         if (!ip_linklocal_in_allowbadttl)
2014                                 goto bad;
2015                 }
2016         }
2017
2018         sum = ip_cksum(m, hlen);
2019         if (sum) {
2020                 goto bad;
2021         }
2022
2023         DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL,
2024             struct ip *, ip, struct ifnet *, inifp,
2025             struct ip *, ip, struct ip6_hdr *, NULL);
2026
2027         /*
2028          * Naively assume we can attribute inbound data to the route we would
2029          * use to send to this destination. Asymmetric routing breaks this
2030          * assumption, but it still allows us to account for traffic from
2031          * a remote node in the routing table.
2032          * this has a very significant performance impact so we bypass
2033          * if nstat_collect is disabled. We may also bypass if the
2034          * protocol is tcp in the future because tcp will have a route that
2035          * we can use to attribute the data to. That does mean we would not
2036          * account for forwarded tcp traffic.
2037          */
2038         if (nstat_collect) {
2039                 struct rtentry *rt =
2040                     ifnet_cached_rtlookup_inet(inifp, ip->ip_src);
2041                 if (rt != NULL) {
2042                         nstat_route_rx(rt, 1, m->m_pkthdr.len, 0);
2043                         rtfree(rt);
2044                 }
2045         }
2046
2047         /*
2048          * Convert fields to host representation.
2049          */
2050 #if BYTE_ORDER != BIG_ENDIAN
2051         NTOHS(ip->ip_len);
2052 #endif
2053
2054         if (ip->ip_len < hlen) {
2055                 OSAddAtomic(1, &ipstat.ips_badlen);
2056                 goto bad;
2057         }
2058
2059 #if BYTE_ORDER != BIG_ENDIAN
2060         NTOHS(ip->ip_off);
2061 #endif
2062         /*
2063          * Check that the amount of data in the buffers
2064          * is as at least much as the IP header would have us expect.
2065          * Trim mbufs if longer than we expect.
2066          * Drop packet if shorter than we expect.
2067          */
2068         if (m->m_pkthdr.len < ip->ip_len) {
2069 tooshort:
2070                 OSAddAtomic(1, &ipstat.ips_tooshort);
2071                 goto bad;
2072         }
2073         if (m->m_pkthdr.len > ip->ip_len) {
2074                 ip_input_adjust(m, ip, inifp);
2075         }
2076
2077         /* for consistency */
2078         m->m_pkthdr.pkt_proto = ip->ip_p;
2079
2080 #if DUMMYNET
2081 check_with_pf:
2082 #endif
2083 #if PF
2084         /* Invoke inbound packet filter */
2085         if (PF_IS_ENABLED) {
2086                 int error;
2087 #if DUMMYNET
2088                 error = pf_af_hook(inifp, NULL, &m, AF_INET, TRUE, &args);
2089 #else
2090                 error = pf_af_hook(inifp, NULL, &m, AF_INET, TRUE, NULL);
2091 #endif /* DUMMYNET */
2092                 if (error != 0 || m == NULL) {
2093                         if (m != NULL) {
2094                                 panic("%s: unexpected packet %p\n",
2095                                     __func__, m);
2096                                 /* NOTREACHED */
2097                         }
2098                         /* Already freed by callee */
2099                         return;
2100                 }
2101                 ip = mtod(m, struct ip *);
2102                 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
2103         }
2104 #endif /* PF */
2105
2106 #if IPSEC
2107         if (ipsec_bypass == 0 && ipsec_gethist(m, NULL))
2108                 goto pass;
2109 #endif
2110
2111 #if IPFIREWALL
2112 #if DUMMYNET
2113 iphack:
2114 #endif /* DUMMYNET */
2115         /*
2116          * Check if we want to allow this packet to be processed.
2117          * Consider it to be bad if not.
2118          */
2119         if (fw_enable && IPFW_LOADED) {
2120 #if IPFIREWALL_FORWARD
2121                 /*
2122                  * If we've been forwarded from the output side, then
2123                  * skip the firewall a second time
2124                  */
2125                 if (args.fwa_next_hop)
2126                         goto ours;
2127 #endif  /* IPFIREWALL_FORWARD */
2128
2129                 args.fwa_m = m;
2130
2131                 i = ip_fw_chk_ptr(&args);
2132                 m = args.fwa_m;
2133
2134                 if ((i & IP_FW_PORT_DENY_FLAG) || m == NULL) { /* drop */
2135                         if (m)
2136                                 m_freem(m);
2137                         return;
2138                 }
2139                 ip = mtod(m, struct ip *); /* just in case m changed */
2140
2141                 if (i == 0 && args.fwa_next_hop == NULL) { /* common case */
2142                         goto pass;
2143                 }
2144 #if DUMMYNET
2145                 if (DUMMYNET_LOADED && (i & IP_FW_PORT_DYNT_FLAG) != 0) {
2146                         /* Send packet to the appropriate pipe */
2147                         ip_dn_io_ptr(m, i&0xffff, DN_TO_IP_IN, &args,
2148                             DN_CLIENT_IPFW);
2149                         return;
2150                 }
2151 #endif /* DUMMYNET */
2152 #if IPDIVERT
2153                 if (i != 0 && (i & IP_FW_PORT_DYNT_FLAG) == 0) {
2154                         /* Divert or tee packet */
2155                         div_info = i;
2156                         goto ours;
2157                 }
2158 #endif
2159 #if IPFIREWALL_FORWARD
2160                 if (i == 0 && args.fwa_next_hop != NULL) {
2161                         goto pass;
2162                 }
2163 #endif
2164                 /*
2165                  * if we get here, the packet must be dropped
2166                  */
2167                 m_freem(m);
2168                 return;
2169         }
2170 #endif /* IPFIREWALL */
2171 #if IPSEC | IPFIREWALL
2172 pass:
2173 #endif
2174         /*
2175          * Process options and, if not destined for us,
2176          * ship it on.  ip_dooptions returns 1 when an
2177          * error was detected (causing an icmp message
2178          * to be sent and the original packet to be freed).
2179          */
2180         ip_nhops = 0;           /* for source routed packets */
2181 #if IPFIREWALL
2182         if (hlen > sizeof (struct ip) &&
2183             ip_dooptions(m, 0, args.fwa_next_hop)) {
2184 #else /* !IPFIREWALL */
2185         if (hlen > sizeof (struct ip) && ip_dooptions(m, 0, NULL)) {
2186 #endif /* !IPFIREWALL */
2187                 return;
2188         }
2189
2190         /*
2191          * Check our list of addresses, to see if the packet is for us.
2192          * If we don't have any addresses, assume any unicast packet
2193          * we receive might be for us (and let the upper layers deal
2194          * with it).
2195          */
2196         if (TAILQ_EMPTY(&in_ifaddrhead) && !(m->m_flags & (M_MCAST|M_BCAST))) {
2197                 ip_setdstifaddr_info(m, inifp->if_index, NULL);
2198                 goto ours;
2199         }
2200
2201         /*
2202          * Cache the destination address of the packet; this may be
2203          * changed by use of 'ipfw fwd'.
2204          */
2205 #if IPFIREWALL
2206         pkt_dst = args.fwa_next_hop == NULL ?
2207             ip->ip_dst : args.fwa_next_hop->sin_addr;
2208 #else /* !IPFIREWALL */
2209         pkt_dst = ip->ip_dst;
2210 #endif /* !IPFIREWALL */
2211
2212         /*
2213          * Enable a consistency check between the destination address
2214          * and the arrival interface for a unicast packet (the RFC 1122
2215          * strong ES model) if IP forwarding is disabled and the packet
2216          * is not locally generated and the packet is not subject to
2217          * 'ipfw fwd'.
2218          *
2219          * XXX - Checking also should be disabled if the destination
2220          * address is ipnat'ed to a different interface.
2221          *
2222          * XXX - Checking is incompatible with IP aliases added
2223          * to the loopback interface instead of the interface where
2224          * the packets are received.
2225          */
2226         checkif = ip_checkinterface && (ipforwarding == 0) &&
2227             !(inifp->if_flags & IFF_LOOPBACK) &&
2228             !(m->m_pkthdr.pkt_flags & PKTF_LOOP)
2229 #if IPFIREWALL
2230             && (args.fwa_next_hop == NULL);
2231 #else /* !IPFIREWALL */
2232                 ;
2233 #endif /* !IPFIREWALL */
2234
2235         /*
2236          * Check for exact addresses in the hash bucket.
2237          */
2238         lck_rw_lock_shared(in_ifaddr_rwlock);
2239         TAILQ_FOREACH(ia, INADDR_HASH(pkt_dst.s_addr), ia_hash) {
2240                 /*
2241                  * If the address matches, verify that the packet
2242                  * arrived via the correct interface if checking is
2243                  * enabled.
2244                  */
2245                 if (IA_SIN(ia)->sin_addr.s_addr == pkt_dst.s_addr &&
2246                     (!checkif || ia->ia_ifp == inifp)) {
2247                         ip_setdstifaddr_info(m, 0, ia);
2248                         lck_rw_done(in_ifaddr_rwlock);
2249                         goto ours;
2250                 }
2251         }
2252         lck_rw_done(in_ifaddr_rwlock);
2253
2254         /*
2255          * Check for broadcast addresses.
2256          *
2257          * Only accept broadcast packets that arrive via the matching
2258          * interface.  Reception of forwarded directed broadcasts would be
2259          * handled via ip_forward() and ether_frameout() with the loopback
2260          * into the stack for SIMPLEX interfaces handled by ether_frameout().
2261          */
2262         if (inifp->if_flags & IFF_BROADCAST) {
2263                 struct ifaddr *ifa;
2264
2265                 ifnet_lock_shared(inifp);
2266                 TAILQ_FOREACH(ifa, &inifp->if_addrhead, ifa_link) {
2267                         if (ifa->ifa_addr->sa_family != AF_INET) {
2268                                 continue;
2269                         }
2270                         ia = ifatoia(ifa);
2271                         if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr ==
2272                             pkt_dst.s_addr || ia->ia_netbroadcast.s_addr ==
2273                             pkt_dst.s_addr) {
2274                                 ip_setdstifaddr_info(m, 0, ia);
2275                                 ifnet_lock_done(inifp);
2276                                 goto ours;
2277                         }
2278                 }
2279                 ifnet_lock_done(inifp);
2280         }
2281
2282         if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
2283                 struct in_multi *inm;
2284                 /*
2285                  * See if we belong to the destination multicast group on the
2286                  * arrival interface.
2287                  */
2288                 in_multihead_lock_shared();
2289                 IN_LOOKUP_MULTI(&ip->ip_dst, inifp, inm);
2290                 in_multihead_lock_done();
2291                 if (inm == NULL) {
2292                         OSAddAtomic(1, &ipstat.ips_notmember);
2293                         m_freem(m);
2294                         return;
2295                 }
2296                 ip_setdstifaddr_info(m, inifp->if_index, NULL);
2297                 INM_REMREF(inm);
2298                 goto ours;
2299         }
2300         if (ip->ip_dst.s_addr == (u_int32_t)INADDR_BROADCAST ||
2301             ip->ip_dst.s_addr == INADDR_ANY) {
2302                 ip_setdstifaddr_info(m, inifp->if_index, NULL);
2303                 goto ours;
2304         }
2305
2306         /* Allow DHCP/BootP responses through */
2307         if ((inifp->if_eflags & IFEF_AUTOCONFIGURING) &&
2308             hlen == sizeof (struct ip) && ip->ip_p == IPPROTO_UDP) {
2309                 struct udpiphdr *ui;
2310
2311                 if (m->m_len < sizeof (struct udpiphdr) &&
2312                     (m = m_pullup(m, sizeof (struct udpiphdr))) == NULL) {
2313                         OSAddAtomic(1, &udpstat.udps_hdrops);
2314                         return;
2315                 }
2316                 ui = mtod(m, struct udpiphdr *);
2317                 if (ntohs(ui->ui_dport) == IPPORT_BOOTPC) {
2318                         ip_setdstifaddr_info(m, inifp->if_index, NULL);
2319                         goto ours;
2320                 }
2321                 ip = mtod(m, struct ip *); /* in case it changed */
2322         }
2323
2324         /*
2325          * Not for us; forward if possible and desirable.
2326          */
2327         if (ipforwarding == 0) {
2328                 OSAddAtomic(1, &ipstat.ips_cantforward);
2329                 m_freem(m);
2330         } else {
2331 #if IPFIREWALL
2332                 ip_forward(m, 0, args.fwa_next_hop);
2333 #else
2334                 ip_forward(m, 0, NULL);
2335 #endif
2336         }
2337         return;
2338
2339 ours:
2340         /*
2341          * If offset or IP_MF are set, must reassemble.
2342          */
2343         if (ip->ip_off & ~(IP_DF | IP_RF)) {
2344                 /*
2345                  * ip_reass() will return a different mbuf, and update
2346                  * the divert info in div_info and args.fwa_divert_rule.
2347                  */
2348 #if IPDIVERT
2349                 m = ip_reass(m, (u_int16_t *)&div_info, &args.fwa_divert_rule);
2350 #else
2351                 m = ip_reass(m);
2352 #endif
2353                 if (m == NULL)
2354                         return;
2355                 ip = mtod(m, struct ip *);
2356                 /* Get the header length of the reassembled packet */
2357                 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
2358 #if IPDIVERT
2359                 /* Restore original checksum before diverting packet */
2360                 if (div_info != 0) {
2361 #if BYTE_ORDER != BIG_ENDIAN
2362                         HTONS(ip->ip_len);
2363                         HTONS(ip->ip_off);
2364 #endif
2365                         ip->ip_sum = 0;
2366                         ip->ip_sum = ip_cksum_hdr_in(m, hlen);
2367 #if BYTE_ORDER != BIG_ENDIAN
2368                         NTOHS(ip->ip_off);
2369                         NTOHS(ip->ip_len);
2370 #endif
2371                 }
2372 #endif
2373         }
2374
2375         /*
2376          * Further protocols expect the packet length to be w/o the
2377          * IP header.
2378          */
2379         ip->ip_len -= hlen;
2380
2381 #if IPDIVERT
2382         /*
2383          * Divert or tee packet to the divert protocol if required.
2384          *
2385          * If div_info is zero then cookie should be too, so we shouldn't
2386          * need to clear them here.  Assume divert_packet() does so also.
2387          */
2388         if (div_info != 0) {
2389                 struct mbuf *clone = NULL;
2390
2391                 /* Clone packet if we're doing a 'tee' */
2392                 if (div_info & IP_FW_PORT_TEE_FLAG)
2393                         clone = m_dup(m, M_DONTWAIT);
2394
2395                 /* Restore packet header fields to original values */
2396                 ip->ip_len += hlen;
2397
2398 #if BYTE_ORDER != BIG_ENDIAN
2399                 HTONS(ip->ip_len);
2400                 HTONS(ip->ip_off);
2401 #endif
2402                 /* Deliver packet to divert input routine */
2403                 OSAddAtomic(1, &ipstat.ips_delivered);
2404                 divert_packet(m, 1, div_info & 0xffff, args.fwa_divert_rule);
2405
2406                 /* If 'tee', continue with original packet */
2407                 if (clone == NULL) {
2408                         return;
2409                 }
2410                 m = clone;
2411                 ip = mtod(m, struct ip *);
2412         }
2413 #endif
2414
2415 #if IPSEC
2416         /*
2417          * enforce IPsec policy checking if we are seeing last header.
2418          * note that we do not visit this with protocols with pcb layer
2419          * code - like udp/tcp/raw ip.
2420          */
2421         if (ipsec_bypass == 0 && (ip_protox[ip->ip_p]->pr_flags & PR_LASTHDR)) {
2422                 if (ipsec4_in_reject(m, NULL)) {
2423                         IPSEC_STAT_INCREMENT(ipsecstat.in_polvio);
2424                         goto bad;
2425                 }
2426         }
2427 #endif /* IPSEC */
2428
2429         /*
2430          * Switch out to protocol's input routine.
2431          */
2432         OSAddAtomic(1, &ipstat.ips_delivered);
2433
2434 #if IPFIREWALL
2435         if (args.fwa_next_hop && ip->ip_p == IPPROTO_TCP) {
2436                 /* TCP needs IPFORWARD info if available */
2437                 struct m_tag *fwd_tag;
2438                 struct ip_fwd_tag *ipfwd_tag;
2439
2440                 fwd_tag = m_tag_create(KERNEL_MODULE_TAG_ID,
2441                     KERNEL_TAG_TYPE_IPFORWARD, sizeof (*ipfwd_tag),
2442                     M_NOWAIT, m);
2443                 if (fwd_tag == NULL)
2444                         goto bad;
2445
2446                 ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag+1);
2447                 ipfwd_tag->next_hop = args.fwa_next_hop;
2448
2449                 m_tag_prepend(m, fwd_tag);
2450
2451                 KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr,
2452                     ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len);
2453
2454                 /* TCP deals with its own locking */
2455                 ip_proto_dispatch_in(m, hlen, ip->ip_p, 0);
2456         } else {
2457                 KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr,
2458                     ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len);
2459
2460                 if ((sw_lro) && (ip->ip_p == IPPROTO_TCP)) {
2461                         m = tcp_lro(m, hlen);
2462                         if (m == NULL)
2463                                 return;
2464                 }
2465
2466                 ip_proto_dispatch_in(m, hlen, ip->ip_p, 0);
2467         }
2468 #else /* !IPFIREWALL */
2469         if ((sw_lro) && (ip->ip_p == IPPROTO_TCP)) {
2470                 m = tcp_lro(m, hlen);
2471                 if (m == NULL)
2472                         return;
2473         }
2474         ip_proto_dispatch_in(m, hlen, ip->ip_p, 0);
2475 #endif /* !IPFIREWALL */
2476         return;
2477
2478 bad:
2479         KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
2480         m_freem(m);
2481 }
2482
2483 /*
2484  * Recompute ipq_limit, the cap that ipq_alloc() consults, from the
2485  * maxnipq tunable.  The caller must hold ipqlock.
2486  *
2487  *   maxnipq < 0   unlimited allocation; ipq_limit becomes 0
2488  *   maxnipq > 0   ipq_limit becomes maxnipq
2489  *   maxnipq == 0  no further queue allocation is wanted; ipq_limit is
2490  *                 pinned at 1 and code elsewhere is relied upon to
2491  *                 prevent allocation and reclaim the current queues
2492  */
2493 static void
2494 ipq_updateparams(void)
2495 {
2496         LCK_MTX_ASSERT(&ipqlock, LCK_MTX_ASSERT_OWNED);
2497
2498         if (maxnipq < 0) {
2499                 ipq_limit = 0;
2500         } else if (maxnipq > 0) {
2501                 ipq_limit = maxnipq;
2502         } else {
2503                 ipq_limit = 1;
2504         }
2505
2506         /* Arm the purge timer if it is idle and there is work to do. */
2507         frag_sched_timeout();
2508 }
2509
2510 /* sysctl handler for maxnipq; out-of-range writes fail with EINVAL. */
2511 static int
2512 sysctl_maxnipq SYSCTL_HANDLER_ARGS
2513 {
2514 #pragma unused(arg1, arg2)
2515         int newval, rc;
2516
2517         lck_mtx_lock(&ipqlock);
2518         newval = maxnipq;
2519         rc = sysctl_handle_int(oidp, &newval, 0, req);
2520         if (rc == 0 && req->newptr != USER_ADDR_NULL) {
2521                 /* impose bounds before committing the new value */
2522                 if (newval < -1 || newval > (nmbclusters / 4)) {
2523                         rc = EINVAL;
2524                 } else {
2525                         maxnipq = newval;
2526                         ipq_updateparams();
2527                 }
2528         }
2529         lck_mtx_unlock(&ipqlock);
2530         return (rc);
2531 }
2532
2533 /* sysctl handler for maxfragsperpacket; a written value is stored as-is. */
2534 static int
2535 sysctl_maxfragsperpacket SYSCTL_HANDLER_ARGS
2536 {
2537 #pragma unused(arg1, arg2)
2538         int newval, rc;
2539
2540         lck_mtx_lock(&ipqlock);
2541         newval = maxfragsperpacket;
2542         rc = sysctl_handle_int(oidp, &newval, 0, req);
2543         if (rc == 0 && req->newptr != USER_ADDR_NULL) {
2544                 maxfragsperpacket = newval;
2545                 ipq_updateparams();     /* may need to arm the timer */
2546         }
2547         lck_mtx_unlock(&ipqlock);
2548         return (rc);
2549 }
2550
2551 /*
2552  * Take an incoming datagram fragment and try to reassemble it into a
2553  * whole datagram.  The reassembly queue for the datagram is looked up
2554  * under ipqlock by id, source, destination and protocol; if none
      * exists, a new queue is created for this fragment.
2555  *
2556  * When IPDIVERT is enabled, keep additional state with each packet that
2557  * tells us if we need to divert or tee the packet we're building.
      * After the queue lookup, *divinfo and *divcookie are cleared whenever
      * the fragment is queued or dropped, and are loaded from the queue when
      * reassembly completes.
2558  *
2559  * The IP header is *NOT* adjusted out of iplen (but in host byte order).
      *
      * The mbuf m is consumed on every path.  Returns the reassembled
      * datagram once the fragment chain is complete, otherwise NULL (the
      * fragment was queued or dropped, or the whole queue was discarded).
2560  */
2561 static struct mbuf *
2562 #if IPDIVERT
2563 ip_reass(struct mbuf *m,
2564 #ifdef IPDIVERT_44
2565     u_int32_t *divinfo,
2566 #else /* IPDIVERT_44 */
2567     u_int16_t *divinfo,
2568 #endif /* IPDIVERT_44 */
2569     u_int16_t *divcookie)
2570 #else /* IPDIVERT */
2571 ip_reass(struct mbuf *m)
2572 #endif /* IPDIVERT */
2573 {
2574         struct ip *ip;
2575         struct mbuf *p, *q, *nq, *t;
2576         struct ipq *fp = NULL;
2577         struct ipqhead *head;
2578         int i, hlen, next;
2579         u_int8_t ecn, ecn0;
2580         uint32_t csum, csum_flags;
2581         uint16_t hash;
2582         struct fq_head dfq;
2583
2584         MBUFQ_INIT(&dfq);       /* for deferred frees */
2585
2586         /* If maxnipq or maxfragsperpacket is 0, never accept fragments. */
2587         if (maxnipq == 0 || maxfragsperpacket == 0) {
                     /* NOTE(review): these counters are bumped without ipqlock */
2588                 ipstat.ips_fragments++;
2589                 ipstat.ips_fragdropped++;
2590                 m_freem(m);
2591                 if (nipq > 0) {
2592                         lck_mtx_lock(&ipqlock);
2593                         frag_sched_timeout();   /* purge stale fragments */
2594                         lck_mtx_unlock(&ipqlock);
2595                 }
2596                 return (NULL);
2597         }
2598
2599         ip = mtod(m, struct ip *);
2600         hlen = IP_VHL_HL(ip->ip_vhl) << 2;
2601
2602         lck_mtx_lock(&ipqlock);
2603
2604         hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id);
2605         head = &ipq[hash];
2606
2607         /*
2608          * Look for queue of fragments
2609          * of this datagram.
2610          */
2611         TAILQ_FOREACH(fp, head, ipq_list) {
2612                 if (ip->ip_id == fp->ipq_id &&
2613                     ip->ip_src.s_addr == fp->ipq_src.s_addr &&
2614                     ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
2615 #if CONFIG_MACF_NET
2616                     mac_ipq_label_compare(m, fp) &&
2617 #endif
2618                     ip->ip_p == fp->ipq_p)
2619                         goto found;
2620         }
2621
2622         fp = NULL;      /* no queue matched this fragment */
2623
2624         /*
2625          * Attempt to trim the number of allocated fragment queues if it
2626          * exceeds the administrative limit.
2627          */
2628         if ((nipq > (unsigned)maxnipq) && (maxnipq > 0)) {
2629                 /*
2630                  * drop something from the tail of the current queue
2631                  * before proceeding further
2632                  */
2633                 struct ipq *fq = TAILQ_LAST(head, ipqhead);
2634                 if (fq == NULL) {   /* gak */
2635                         for (i = 0; i < IPREASS_NHASH; i++) {
2636                                 struct ipq *r = TAILQ_LAST(&ipq[i], ipqhead);
2637                                 if (r) {
2638                                         ipstat.ips_fragtimeout += r->ipq_nfrags;
2639                                         frag_freef(&ipq[i], r);
2640                                         break;
2641                                 }
2642                         }
2643                 } else {
2644                         ipstat.ips_fragtimeout += fq->ipq_nfrags;
2645                         frag_freef(head, fq);
2646                 }
2647         }
2648
2649 found:
2650         /*
2651          * Leverage partial checksum offload for IP fragments.  Narrow down
2652          * the scope to cover only UDP without IP options, as that is the
2653          * most common case.
2654          *
2655          * Perform 1's complement adjustment of octets that got included/
2656          * excluded in the hardware-calculated checksum value.  Ignore cases
2657          * where the value includes the entire IPv4 header span, as the sum
2658          * for those octets would already be 0 by the time we get here; IP
2659          * has already performed its header checksum validation.  Also take
2660          * care of any trailing bytes and subtract out their partial sum.
2661          */
2662         if (ip->ip_p == IPPROTO_UDP && hlen == sizeof (struct ip) &&
2663             (m->m_pkthdr.csum_flags &
2664             (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
2665             (CSUM_DATA_VALID | CSUM_PARTIAL)) {
2666                 uint32_t start = m->m_pkthdr.csum_rx_start;
2667                 int32_t trailer = (m_pktlen(m) - ip->ip_len);
2668                 uint32_t swbytes = (uint32_t)trailer;
2669
2670                 csum = m->m_pkthdr.csum_rx_val;
2671
2672                 ASSERT(trailer >= 0);
2673                 if ((start != 0 && start != hlen) || trailer != 0) {
2674 #if BYTE_ORDER != BIG_ENDIAN
2675                         if (start < hlen) {
2676                                 HTONS(ip->ip_len);
2677                                 HTONS(ip->ip_off);
2678                         }
2679 #endif /* BYTE_ORDER != BIG_ENDIAN */
2680                         /* callee folds in sum */
2681                         csum = m_adj_sum16(m, start, hlen,
2682                             (ip->ip_len - hlen), csum);
2683                         if (hlen > start)
2684                                 swbytes += (hlen - start);
2685                         else
2686                                 swbytes += (start - hlen);
2687 #if BYTE_ORDER != BIG_ENDIAN
2688                         if (start < hlen) {
2689                                 NTOHS(ip->ip_off);
2690                                 NTOHS(ip->ip_len);
2691                         }
2692 #endif /* BYTE_ORDER != BIG_ENDIAN */
2693                 }
2694                 csum_flags = m->m_pkthdr.csum_flags;
2695
2696                 if (swbytes != 0)
2697                         udp_in_cksum_stats(swbytes);
2698                 if (trailer != 0)
2699                         m_adj(m, -trailer);
2700         } else {
2701                 csum = 0;
2702                 csum_flags = 0;
2703         }
2704
2705         /* Invalidate checksum */
2706         m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
2707
2708         ipstat.ips_fragments++;
2709
2710         /*
2711          * Adjust ip_len to not reflect header,
2712          * convert offset of this to bytes.
2713          */
2714         ip->ip_len -= hlen;
2715         if (ip->ip_off & IP_MF) {
2716                 /*
2717                  * Make sure that fragments have a data length
2718                  * that's a non-zero multiple of 8 bytes.
2719                  */
2720                 if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) {
2721                         OSAddAtomic(1, &ipstat.ips_toosmall);
2722                         /*
2723                          * Reassembly queue may have been found if previous
2724                          * fragments were valid; given that this one is bad,
2725                          * we need to drop it.  Make sure to set fp to NULL
2726                          * if not already, since we don't want to decrement
2727                          * ipq_nfrags as it doesn't include this packet.
2728                          */
2729                         fp = NULL;
2730                         goto dropfrag;
2731                 }
2732                 m->m_flags |= M_FRAG;
2733         } else {
2734                 /* Clear the flag in case packet comes from loopback */
2735                 m->m_flags &= ~M_FRAG;
2736         }
2737         ip->ip_off <<= 3;
2738
             /* remember where the IP header is; GETIP() below reads it back */
2739         m->m_pkthdr.pkt_hdr = ip;
2740
2741         /* Previous ip_reass() started here. */
2742         /*
2743          * Presence of header sizes in mbufs
2744          * would confuse code below.
2745          */
2746         m->m_data += hlen;
2747         m->m_len -= hlen;
2748
2749         /*
2750          * If first fragment to arrive, create a reassembly queue.
2751          */
2752         if (fp == NULL) {
2753                 fp = ipq_alloc(M_DONTWAIT);
2754                 if (fp == NULL)
2755                         goto dropfrag;
2756 #if CONFIG_MACF_NET
2757                 if (mac_ipq_label_init(fp, M_NOWAIT) != 0) {
2758                         ipq_free(fp);
2759                         fp = NULL;
2760                         goto dropfrag;
2761                 }
2762                 mac_ipq_label_associate(m, fp);
2763 #endif
2764                 TAILQ_INSERT_HEAD(head, fp, ipq_list);
2765                 nipq++;
2766                 fp->ipq_nfrags = 1;
2767                 fp->ipq_ttl = IPFRAGTTL;
2768                 fp->ipq_p = ip->ip_p;
2769                 fp->ipq_id = ip->ip_id;
2770                 fp->ipq_src = ip->ip_src;
2771                 fp->ipq_dst = ip->ip_dst;
2772                 fp->ipq_frags = m;
2773                 m->m_nextpkt = NULL;
2774                 /*
2775                  * If the first fragment has valid checksum offload
2776                  * info, the rest of fragments are eligible as well.
2777                  */
2778                 if (csum_flags != 0) {
2779                         fp->ipq_csum = csum;
2780                         fp->ipq_csum_flags = csum_flags;
2781                 }
2782 #if IPDIVERT
2783                 /*
2784                  * Transfer firewall instructions to the fragment structure.
2785                  * Only trust info in the fragment at offset 0.
2786                  */
2787                 if (ip->ip_off == 0) {
2788 #ifdef IPDIVERT_44
2789                         fp->ipq_div_info = *divinfo;
2790 #else
2791                         fp->ipq_divert = *divinfo;
2792 #endif
2793                         fp->ipq_div_cookie = *divcookie;
2794                 }
2795                 *divinfo = 0;
2796                 *divcookie = 0;
2797 #endif /* IPDIVERT */
2798                 m = NULL;       /* nothing to return */
2799                 goto done;
2800         } else {
2801                 fp->ipq_nfrags++;
2802 #if CONFIG_MACF_NET
2803                 mac_ipq_label_update(m, fp);
2804 #endif
2805         }
2806
2807 #define GETIP(m)        ((struct ip *)((m)->m_pkthdr.pkt_hdr))
2808
2809         /*
2810          * Handle ECN by comparing this segment with the first one;
2811          * if CE is set, do not lose CE.
2812          * drop if CE and not-ECT are mixed for the same packet.
2813          */
2814         ecn = ip->ip_tos & IPTOS_ECN_MASK;
2815         ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK;
2816         if (ecn == IPTOS_ECN_CE) {
2817                 if (ecn0 == IPTOS_ECN_NOTECT)
2818                         goto dropfrag;
2819                 if (ecn0 != IPTOS_ECN_CE)
2820                         GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE;
2821         }
2822         if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT)
2823                 goto dropfrag;
2824
2825         /*
2826          * Find a segment which begins after this one does.
2827          */
2828         for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt)
2829                 if (GETIP(q)->ip_off > ip->ip_off)
2830                         break;
2831
2832         /*
2833          * If there is a preceding segment, it may provide some of
2834          * our data already.  If so, drop the data from the incoming
2835          * segment.  If it provides all of our data, drop us, otherwise
2836          * stick new segment in the proper place.
2837          *
2838          * If some of the data is dropped from the preceding
2839          * segment, then its checksum is invalidated.
2840          */
2841         if (p) {
2842                 i = GETIP(p)->ip_off + GETIP(p)->ip_len - ip->ip_off;
2843                 if (i > 0) {
2844                         if (i >= ip->ip_len)
2845                                 goto dropfrag;
2846                         m_adj(m, i);
2847                         fp->ipq_csum_flags = 0;
2848                         ip->ip_off += i;
2849                         ip->ip_len -= i;
2850                 }
2851                 m->m_nextpkt = p->m_nextpkt;
2852                 p->m_nextpkt = m;
2853         } else {
2854                 m->m_nextpkt = fp->ipq_frags;
2855                 fp->ipq_frags = m;
2856         }
2857
2858         /*
2859          * While we overlap succeeding segments trim them or,
2860          * if they are completely covered, dequeue them.
2861          */
2862         for (; q != NULL && ip->ip_off + ip->ip_len > GETIP(q)->ip_off;
2863             q = nq) {
2864                 i = (ip->ip_off + ip->ip_len) - GETIP(q)->ip_off;
2865                 if (i < GETIP(q)->ip_len) {
2866                         GETIP(q)->ip_len -= i;
2867                         GETIP(q)->ip_off += i;
2868                         m_adj(q, i);
2869                         fp->ipq_csum_flags = 0;
2870                         break;
2871                 }
2872                 nq = q->m_nextpkt;
2873                 m->m_nextpkt = nq;
2874                 ipstat.ips_fragdropped++;
2875                 fp->ipq_nfrags--;
2876                 /* defer freeing until after lock is dropped */
2877                 MBUFQ_ENQUEUE(&dfq, q);
2878         }
2879
2880         /*
2881          * If this fragment contains similar checksum offload info
2882          * as that of the existing ones, accumulate checksum.  Otherwise,
2883          * invalidate checksum offload info for the entire datagram.
2884          */
2885         if (csum_flags != 0 && csum_flags == fp->ipq_csum_flags)
2886                 fp->ipq_csum += csum;
2887         else if (fp->ipq_csum_flags != 0)
2888                 fp->ipq_csum_flags = 0;
2889
2890 #if IPDIVERT
2891         /*
2892          * Transfer firewall instructions to the fragment structure.
2893          * Only trust info in the fragment at offset 0.
2894          */
2895         if (ip->ip_off == 0) {
2896 #ifdef IPDIVERT_44
2897                 fp->ipq_div_info = *divinfo;
2898 #else
2899                 fp->ipq_divert = *divinfo;
2900 #endif
2901                 fp->ipq_div_cookie = *divcookie;
2902         }
2903         *divinfo = 0;
2904         *divcookie = 0;
2905 #endif /* IPDIVERT */
2906
2907         /*
2908          * Check for complete reassembly and perform frag per packet
2909          * limiting.
2910          *
2911          * Frag limiting is performed here so that the nth frag has
2912          * a chance to complete the packet before we drop the packet.
2913          * As a result, n+1 frags are actually allowed per packet, but
2914          * only n will ever be stored. (n = maxfragsperpacket.)
2915          *
2916          */
2917         next = 0;
2918         for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
2919                 if (GETIP(q)->ip_off != next) {
2920                         if (fp->ipq_nfrags > maxfragsperpacket) {
2921                                 ipstat.ips_fragdropped += fp->ipq_nfrags;
2922                                 frag_freef(head, fp);
2923                         }
2924                         m = NULL;       /* nothing to return */
2925                         goto done;
2926                 }
2927                 next += GETIP(q)->ip_len;
2928         }
             /* p is the chain's tail; non-NULL since m was linked in above */
2929         /* Make sure the last packet didn't have the IP_MF flag */
2930         if (p->m_flags & M_FRAG) {
2931                 if (fp->ipq_nfrags > maxfragsperpacket) {
2932                         ipstat.ips_fragdropped += fp->ipq_nfrags;
2933                         frag_freef(head, fp);
2934                 }
2935                 m = NULL;               /* nothing to return */
2936                 goto done;
2937         }
2938
2939         /*
2940          * Reassembly is complete.  Make sure the packet is a sane size.
2941          */
2942         q = fp->ipq_frags;
2943         ip = GETIP(q);
2944         if (next + (IP_VHL_HL(ip->ip_vhl) << 2) > IP_MAXPACKET) {
2945                 ipstat.ips_toolong++;
2946                 ipstat.ips_fragdropped += fp->ipq_nfrags;
2947                 frag_freef(head, fp);
2948                 m = NULL;               /* nothing to return */
2949                 goto done;
2950         }
2951
2952         /*
2953          * Concatenate fragments.
2954          */
2955         m = q;
2956         t = m->m_next;
2957         m->m_next = NULL;
2958         m_cat(m, t);
2959         nq = q->m_nextpkt;
2960         q->m_nextpkt = NULL;
2961         for (q = nq; q != NULL; q = nq) {
2962                 nq = q->m_nextpkt;
2963                 q->m_nextpkt = NULL;
2964                 m_cat(m, q);
2965         }
2966
2967         /*
2968          * Store partial hardware checksum info from the fragment queue;
2969          * the receive start offset is set to 20 bytes (see code at the
2970          * top of this routine.)
2971          */
2972         if (fp->ipq_csum_flags != 0) {
2973                 csum = fp->ipq_csum;
2974
2975                 ADDCARRY(csum);
2976
2977                 m->m_pkthdr.csum_rx_val = csum;
2978                 m->m_pkthdr.csum_rx_start = sizeof (struct ip);
2979                 m->m_pkthdr.csum_flags = fp->ipq_csum_flags;
2980         } else if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) ||
2981             (m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
2982                 /* loopback checksums are always OK */
2983                 m->m_pkthdr.csum_data = 0xffff;
                     /*
                      * NOTE(review): the plain assignment that follows replaces
                      * every flag, so clearing CSUM_PARTIAL first is redundant.
                      */
2984                 m->m_pkthdr.csum_flags &= ~CSUM_PARTIAL;
2985                 m->m_pkthdr.csum_flags =
2986                     CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
2987                     CSUM_IP_CHECKED | CSUM_IP_VALID;
2988         }
2989
2990 #if IPDIVERT
2991         /*
2992          * Extract firewall instructions from the fragment structure.
2993          */
2994 #ifdef IPDIVERT_44
2995         *divinfo = fp->ipq_div_info;
2996 #else
2997         *divinfo = fp->ipq_divert;
2998 #endif
2999         *divcookie = fp->ipq_div_cookie;
3000 #endif /* IPDIVERT */
3001
3002 #if CONFIG_MACF_NET
3003         mac_mbuf_label_associate_ipq(fp, m);
3004         mac_ipq_label_destroy(fp);
3005 #endif
3006         /*
3007          * Create header for new ip packet by modifying header of first
3008          * packet; dequeue and discard fragment reassembly header.
3009          * Make header visible.
3010          */
3011         ip->ip_len = (IP_VHL_HL(ip->ip_vhl) << 2) + next;
3012         ip->ip_src = fp->ipq_src;
3013         ip->ip_dst = fp->ipq_dst;
3014
3015         fp->ipq_frags = NULL;   /* return to caller as 'm' */
3016         frag_freef(head, fp);
3017         fp = NULL;
3018
             /* undo the m_data/m_len skip applied when this fragment was queued */
3019         m->m_len += (IP_VHL_HL(ip->ip_vhl) << 2);
3020         m->m_data -= (IP_VHL_HL(ip->ip_vhl) << 2);
3021         /* some debugging cruft by sklower, below, will go away soon */
3022         if (m->m_flags & M_PKTHDR)      /* XXX this should be done elsewhere */
3023                 m_fixhdr(m);
3024         ipstat.ips_reassembled++;
3025
3026         /* arm the purge timer if not already and if there's work to do */
3027         frag_sched_timeout();
3028         lck_mtx_unlock(&ipqlock);
3029         /* perform deferred free (if needed) now that lock is dropped */
3030         if (!MBUFQ_EMPTY(&dfq))
3031                 MBUFQ_DRAIN(&dfq);
3032         VERIFY(MBUFQ_EMPTY(&dfq));
3033         return (m);
3034
3035 done:
3036         VERIFY(m == NULL);
3037         /* arm the purge timer if not already and if there's work to do */
3038         frag_sched_timeout();
3039         lck_mtx_unlock(&ipqlock);
3040         /* perform deferred free (if needed) */
3041         if (!MBUFQ_EMPTY(&dfq))
3042                 MBUFQ_DRAIN(&dfq);
3043         VERIFY(MBUFQ_EMPTY(&dfq));
3044         return (NULL);
3045
3046 dropfrag:
3047 #if IPDIVERT
3048         *divinfo = 0;
3049         *divcookie = 0;
3050 #endif /* IPDIVERT */
3051         ipstat.ips_fragdropped++;
             /* fp is non-NULL only if this fragment was already counted in it */
3052         if (fp != NULL)
3053                 fp->ipq_nfrags--;
3054         /* arm the purge timer if not already and if there's work to do */
3055         frag_sched_timeout();
3056         lck_mtx_unlock(&ipqlock);
3057         m_freem(m);
3058         /* perform deferred free (if needed) */
3059         if (!MBUFQ_EMPTY(&dfq))
3060                 MBUFQ_DRAIN(&dfq);
3061         VERIFY(MBUFQ_EMPTY(&dfq));
3062         return (NULL);
3063 #undef GETIP
3064 }
3065
3066 /*
3067  * Destroy reassembly queue fp (ipqlock held): free its fragment chain,
3068  * unlink it from bucket fhp, drop nipq and release the header itself.
3069  */
3070 static void
3071 frag_freef(struct ipqhead *fhp, struct ipq *fp)
3072 {
3073         struct mbuf *chain;
3074         LCK_MTX_ASSERT(&ipqlock, LCK_MTX_ASSERT_OWNED);
3075         chain = fp->ipq_frags;
3076         fp->ipq_nfrags = 0;
3077         fp->ipq_frags = NULL;
3078         if (chain != NULL)
3079                 m_freem_list(chain);
3080         TAILQ_REMOVE(fhp, fp, ipq_list);
3081         nipq--;
3082         ipq_free(fp);
3083 }
3084
3085 /*
3086  * Reassembly purge callout: age every queue, free the expired ones, trim
3087  * down to a lowered maxnipq, then re-arm if queues remain.
3088  */
3089 static void
3090 frag_timeout(void *arg)
3091 {
3092 #pragma unused(arg)
3093         struct ipq *cur, *nxt, *victim;
3094         int bucket;
3095
3096         /*
3097          * Piggy-back on this callout to refresh the coarse-grained
3098          * (seconds) networking timestamp returnable via net_uptime().
3099          */
3100         net_update_uptime();
3101
3102         lck_mtx_lock(&ipqlock);
3103         /* Pass 1: tick each queue's TTL and reap those reaching zero. */
3104         for (bucket = 0; bucket < IPREASS_NHASH; bucket++) {
3105                 cur = TAILQ_FIRST(&ipq[bucket]);
3106                 while (cur != NULL) {
3107                         /* fetch the successor first; cur may be freed */
3108                         nxt = TAILQ_NEXT(cur, ipq_list);
3109                         cur->ipq_ttl--;
3110                         if (cur->ipq_ttl == 0) {
3111                                 ipstat.ips_fragtimeout += cur->ipq_nfrags;
3112                                 frag_freef(&ipq[bucket], cur);
3113                         }
3114                         cur = nxt;
3115                 }
3116         }
3117         /*
3118          * Pass 2: if the limit was lowered below the current population,
3119          * discard from the head of each bucket until nipq fits again.
3120          */
3121         if (maxnipq >= 0 && nipq > (unsigned)maxnipq) {
3122                 for (bucket = 0; bucket < IPREASS_NHASH; bucket++) {
3123                         while (nipq > (unsigned)maxnipq &&
3124                             (victim = TAILQ_FIRST(&ipq[bucket])) != NULL) {
3125                                 ipstat.ips_fragdropped += victim->ipq_nfrags;
3126                                 frag_freef(&ipq[bucket], victim);
3127                         }
3128                 }
3129         }
3130         /* clear the "scheduled" flag so frag_sched_timeout() can re-arm */
3131         frag_timeout_run = 0;
3132         frag_sched_timeout();
3133         lck_mtx_unlock(&ipqlock);
3134 }
3135
3136 /* Arm frag_timeout() unless it is already armed or no queues exist. */
3137 static void
3138 frag_sched_timeout(void)
3139 {
3140         LCK_MTX_ASSERT(&ipqlock, LCK_MTX_ASSERT_OWNED);
3141         if (frag_timeout_run || !(nipq > 0))
3142                 return;
3143         frag_timeout_run = 1;
3144         timeout(frag_timeout, NULL, hz);
3145 }
3146
3147 /* Empty every reassembly bucket, counting freed fragments as dropped. */
3148 static void
3149 frag_drain(void)
3150 {
3151         struct ipqhead *bucket;
3152         struct ipq *victim;
3153         int idx;
3154
3155         lck_mtx_lock(&ipqlock);
3156         for (idx = 0; idx < IPREASS_NHASH; idx++) {
3157                 bucket = &ipq[idx];
3158                 while ((victim = TAILQ_FIRST(bucket)) != NULL) {
3159                         ipstat.ips_fragdropped += victim->ipq_nfrags;
3160                         frag_freef(bucket, victim);
3161                 }
3162         }
3163         lck_mtx_unlock(&ipqlock);
3164 }
3165
3166 static struct ipq *
3167 ipq_alloc(int how)
3168 {
3169         struct mbuf *t;
3170         struct ipq *fp;
3171
3172         /*
3173          * See comments in ipq_updateparams().  Keep the count separate
3174          * from nipq since the latter represents the elements already
3175          * in the reassembly queues.
3176          */
3177         if (ipq_limit > 0 && ipq_count > ipq_limit)
3178                 return (NULL);
3179
3180         t = m_get(how, MT_FTABLE);
3181         if (t != NULL) {
3182                 atomic_add_32(&ipq_count, 1);
3183                 fp = mtod(t, struct ipq *);
3184                 bzero(fp, sizeof (*fp));
3185         } else {
3186                 fp = NULL;
3187         }
3188         return (fp);
3189 }
3190
3191 static void
3192 ipq_free(struct ipq *fp)
3193 {
3194         (void) m_free(dtom(fp));
3195         atomic_add_32(&ipq_count, -1);
3196 }
3197
3198 /*
3199  * Drain callback
3200  */
3201 void
3202 ip_drain(void)
3203 {
3204         frag_drain();           /* fragments */
3205         in_rtqdrain();          /* protocol cloned routes */
3206         in_arpdrain(NULL);      /* cloned routes: ARP */
3207 }
3208
3209 /*
3210  * Do option processing on a datagram,
3211  * possibly discarding it if bad options are encountered,
3212  * or forwarding it if source-routed.
3213  * The pass argument is used when operating in the IPSTEALTH
3214  * mode to tell what options to process:
3215  * [LS]SRR (pass 0) or the others (pass 1).
3216  * The reason for as many as two passes is that when doing IPSTEALTH,
3217  * non-routing options should be processed only if the packet is for us.
3218  * Returns 1 if packet has been forwarded/freed,
3219  * 0 if the packet should be processed further.
3220  */
3221 static int
3222 ip_dooptions(struct mbuf *m, int pass, struct sockaddr_in *next_hop)
3223 {
3224 #pragma unused(pass)
3225         struct ip *ip = mtod(m, struct ip *);
3226         u_char *cp;
3227         struct ip_timestamp *ipt;
3228         struct in_ifaddr *ia;
3229         int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0;
3230         struct in_addr *sin, dst;
3231         u_int32_t ntime;
3232         struct sockaddr_in ipaddr = {
3233             sizeof (ipaddr), AF_INET, 0, { 0 }, { 0, } };
3234
3235         /* Expect 32-bit aligned data pointer on strict-align platforms */
3236         MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
3237
3238         dst = ip->ip_dst;
3239         cp = (u_char *)(ip + 1);
3240         cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip);
3241         for (; cnt > 0; cnt -= optlen, cp += optlen) {
3242                 opt = cp[IPOPT_OPTVAL];
3243                 if (opt == IPOPT_EOL)
3244                         break;
3245                 if (opt == IPOPT_NOP)
3246                         optlen = 1;
3247                 else {
3248                         if (cnt < IPOPT_OLEN + sizeof (*cp)) {
3249                                 code = &cp[IPOPT_OLEN] - (u_char *)ip;
3250                                 goto bad;
3251                         }
3252                         optlen = cp[IPOPT_OLEN];
3253                         if (optlen < IPOPT_OLEN + sizeof (*cp) ||
3254                             optlen > cnt) {
3255                                 code = &cp[IPOPT_OLEN] - (u_char *)ip;
3256                                 goto bad;
3257                         }
3258                 }
3259                 switch (opt) {
3260
3261                 default:
3262                         break;
3263
3264                 /*
3265                  * Source routing with record.
3266                  * Find interface with current destination address.
3267                  * If none on this machine then drop if strictly routed,
3268                  * or do nothing if loosely routed.
3269                  * Record interface address and bring up next address
3270                  * component.  If strictly routed make sure next
3271                  * address is on directly accessible net.
3272                  */
3273                 case IPOPT_LSRR:
3274                 case IPOPT_SSRR:
3275                         if (optlen < IPOPT_OFFSET + sizeof (*cp)) {
3276                                 code = &cp[IPOPT_OLEN] - (u_char *)ip;
3277                                 goto bad;
3278                         }
3279                         if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
3280                                 code = &cp[IPOPT_OFFSET] - (u_char *)ip;
3281                                 goto bad;
3282                         }
3283                         ipaddr.sin_addr = ip->ip_dst;
3284                         ia = (struct in_ifaddr *)ifa_ifwithaddr(SA(&ipaddr));
3285                         if (ia == NULL) {
3286                                 if (opt == IPOPT_SSRR) {
3287                                         type = ICMP_UNREACH;
3288                                         code = ICMP_UNREACH_SRCFAIL;
3289                                         goto bad;
3290                                 }
3291                                 if (!ip_dosourceroute)
3292                                         goto nosourcerouting;
3293                                 /*
3294                                  * Loose routing, and not at next destination
3295                                  * yet; nothing to do except forward.
3296                                  */
3297                                 break;
3298                         } else {
3299                                 IFA_REMREF(&ia->ia_ifa);
3300                                 ia = NULL;
3301                         }
3302                         off--;                  /* 0 origin */
3303                         if (off > optlen - (int)sizeof (struct in_addr)) {
3304                                 /*
3305                                  * End of source route.  Should be for us.
3306                                  */
3307                                 if (!ip_acceptsourceroute)
3308                                         goto nosourcerouting;
3309                                 save_rte(cp, ip->ip_src);
3310                                 break;
3311                         }
3312
3313                         if (!ip_dosourceroute) {
3314                                 if (ipforwarding) {
3315                                         char buf[MAX_IPv4_STR_LEN];
3316                                         char buf2[MAX_IPv4_STR_LEN];
3317                                         /*
3318                                          * Acting as a router, so generate ICMP
3319                                          */
3320 nosourcerouting:
3321                                         log(LOG_WARNING,
3322                                             "attempted source route from %s "
3323                                             "to %s\n",
3324                                             inet_ntop(AF_INET, &ip->ip_src,
3325                                             buf, sizeof (buf)),
3326                                             inet_ntop(AF_INET, &ip->ip_dst,
3327                                             buf2, sizeof (buf2)));
3328                                         type = ICMP_UNREACH;
3329                                         code = ICMP_UNREACH_SRCFAIL;
3330                                         goto bad;
3331                                 } else {
3332                                         /*
3333                                          * Not acting as a router,
3334                                          * so silently drop.
3335                                          */
3336                                         OSAddAtomic(1, &ipstat.ips_cantforward);
3337                                         m_freem(m);
3338                                         return (1);
3339                                 }
3340                         }
3341
3342                         /*
3343                          * locate outgoing interface
3344                          */
3345                         (void) memcpy(&ipaddr.sin_addr, cp + off,
3346                             sizeof (ipaddr.sin_addr));
3347
3348                         if (opt == IPOPT_SSRR) {
3349 #define INA     struct in_ifaddr *
3350                                 if ((ia = (INA)ifa_ifwithdstaddr(
3351                                     SA(&ipaddr))) == NULL) {
3352                                         ia = (INA)ifa_ifwithnet(SA(&ipaddr));
3353                                 }
3354                         } else {
3355                                 ia = ip_rtaddr(ipaddr.sin_addr);
3356                         }
3357                         if (ia == NULL) {
3358                                 type = ICMP_UNREACH;
3359                                 code = ICMP_UNREACH_SRCFAIL;
3360                                 goto bad;
3361                         }
3362                         ip->ip_dst = ipaddr.sin_addr;
3363                         IFA_LOCK(&ia->ia_ifa);
3364                         (void) memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
3365                             sizeof (struct in_addr));
3366                         IFA_UNLOCK(&ia->ia_ifa);
3367                         IFA_REMREF(&ia->ia_ifa);
3368                         ia = NULL;
3369                         cp[IPOPT_OFFSET] += sizeof (struct in_addr);
3370                         /*
3371                          * Let ip_intr's mcast routing check handle mcast pkts
3372                          */
3373                         forward = !IN_MULTICAST(ntohl(ip->ip_dst.s_addr));
3374                         break;
3375
3376                 case IPOPT_RR:
3377                         if (optlen < IPOPT_OFFSET + sizeof (*cp)) {
3378                                 code = &cp[IPOPT_OFFSET] - (u_char *)ip;
3379                                 goto bad;
3380                         }
3381                         if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
3382                                 code = &cp[IPOPT_OFFSET] - (u_char *)ip;
3383                                 goto bad;
3384                         }
3385                         /*
3386                          * If no space remains, ignore.
3387                          */
3388                         off--;                  /* 0 origin */
3389                         if (off > optlen - (int)sizeof (struct in_addr))
3390                                 break;
3391                         (void) memcpy(&ipaddr.sin_addr, &ip->ip_dst,
3392                             sizeof (ipaddr.sin_addr));
3393                         /*
3394                          * locate outgoing interface; if we're the destination,
3395                          * use the incoming interface (should be same).
3396                          */
3397                         if ((ia = (INA)ifa_ifwithaddr(SA(&ipaddr))) == NULL) {
3398                                 if ((ia = ip_rtaddr(ipaddr.sin_addr)) == NULL) {
3399                                         type = ICMP_UNREACH;
3400                                         code = ICMP_UNREACH_HOST;
3401                                         goto bad;
3402                                 }
3403                         }
3404                         IFA_LOCK(&ia->ia_ifa);
3405                         (void) memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
3406                             sizeof (struct in_addr));
3407                         IFA_UNLOCK(&ia->ia_ifa);
3408                         IFA_REMREF(&ia->ia_ifa);
3409                         ia = NULL;
3410                         cp[IPOPT_OFFSET] += sizeof (struct in_addr);
3411                         break;
3412
3413                 case IPOPT_TS:
3414                         code = cp - (u_char *)ip;
3415                         ipt = (struct ip_timestamp *)(void *)cp;
3416                         if (ipt->ipt_len < 4 || ipt->ipt_len > 40) {
3417                                 code = (u_char *)&ipt->ipt_len - (u_char *)ip;
3418                                 goto bad;
3419                         }
3420                         if (ipt->ipt_ptr < 5) {
3421                                 code = (u_char *)&ipt->ipt_ptr - (u_char *)ip;
3422                                 goto bad;
3423                         }
3424                         if (ipt->ipt_ptr >
3425                             ipt->ipt_len - (int)sizeof (int32_t)) {
3426                                 if (++ipt->ipt_oflw == 0) {
3427                                         code = (u_char *)&ipt->ipt_ptr -
3428                                             (u_char *)ip;
3429                                         goto bad;
3430                                 }
3431                                 break;
3432                         }
3433                         sin = (struct in_addr *)(void *)(cp + ipt->ipt_ptr - 1);
3434                         switch (ipt->ipt_flg) {
3435
3436                         case IPOPT_TS_TSONLY:
3437                                 break;
3438
3439                         case IPOPT_TS_TSANDADDR:
3440                                 if (ipt->ipt_ptr - 1 + sizeof (n_time) +
3441                                     sizeof (struct in_addr) > ipt->ipt_len) {
3442                                         code = (u_char *)&ipt->ipt_ptr -
3443                                             (u_char *)ip;
3444                                         goto bad;
3445                                 }
3446                                 ipaddr.sin_addr = dst;
3447                                 ia = (INA)ifaof_ifpforaddr(SA(&ipaddr),
3448                                     m->m_pkthdr.rcvif);
3449                                 if (ia == NULL)
3450                                         continue;
3451                                 IFA_LOCK(&ia->ia_ifa);
3452                                 (void) memcpy(sin, &IA_SIN(ia)->sin_addr,
3453                                     sizeof (struct in_addr));
3454                                 IFA_UNLOCK(&ia->ia_ifa);
3455                                 ipt->ipt_ptr += sizeof (struct in_addr);
3456                                 IFA_REMREF(&ia->ia_ifa);
3457                                 ia = NULL;
3458                                 break;
3459
3460                         case IPOPT_TS_PRESPEC:
3461                                 if (ipt->ipt_ptr - 1 + sizeof (n_time) +
3462                                     sizeof (struct in_addr) > ipt->ipt_len) {
3463                                         code = (u_char *)&ipt->ipt_ptr -
3464                                             (u_char *)ip;
3465                                         goto bad;
3466                                 }
3467                                 (void) memcpy(&ipaddr.sin_addr, sin,
3468                                     sizeof (struct in_addr));
3469                                 if ((ia = (struct in_ifaddr *)ifa_ifwithaddr(
3470                                     SA(&ipaddr))) == NULL)
3471                                         continue;
3472                                 IFA_REMREF(&ia->ia_ifa);
3473                                 ia = NULL;
3474                                 ipt->ipt_ptr += sizeof (struct in_addr);
3475                                 break;
3476
3477                         default:
3478                                 /* XXX can't take &ipt->ipt_flg */
3479                                 code = (u_char *)&ipt->ipt_ptr -
3480                                     (u_char *)ip + 1;
3481                                 goto bad;
3482                         }
3483                         ntime = iptime();
3484                         (void) memcpy(cp + ipt->ipt_ptr - 1, &ntime,
3485                             sizeof (n_time));
3486                         ipt->ipt_ptr += sizeof (n_time);
3487                 }
3488         }
3489         if (forward && ipforwarding) {
3490                 ip_forward(m, 1, next_hop);
3491                 return (1);
3492         }
3493         return (0);
3494 bad:
3495         icmp_error(m, type, code, 0, 0);
3496         OSAddAtomic(1, &ipstat.ips_badoptions);
3497         return (1);
3498 }
3499
3500 /*
3501  * Check for the presence of the IP Router Alert option [RFC2113]
3502  * in the header of an IPv4 datagram.
3503  *
3504  * This call is not intended for use from the forwarding path; it is here
3505  * so that protocol domains may check for the presence of the option.
3506  * Given how FreeBSD's IPv4 stack is currently structured, the Router Alert
3507  * option does not have much relevance to the implementation, though this
3508  * may change in future.
3509  * Router alert options SHOULD be passed if running in IPSTEALTH mode and
3510  * we are not the endpoint.
3511  * Length checks on individual options should already have been peformed
3512  * by ip_dooptions() therefore they are folded under DIAGNOSTIC here.
3513  *
3514  * Return zero if not present or options are invalid, non-zero if present.
3515  */
3516 int
3517 ip_checkrouteralert(struct mbuf *m)
3518 {
3519         struct ip *ip = mtod(m, struct ip *);
3520         u_char *cp;
3521         int opt, optlen, cnt, found_ra;
3522
3523         found_ra = 0;
3524         cp = (u_char *)(ip + 1);
3525         cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip);
3526         for (; cnt > 0; cnt -= optlen, cp += optlen) {
3527                 opt = cp[IPOPT_OPTVAL];
3528                 if (opt == IPOPT_EOL)
3529                         break;
3530                 if (opt == IPOPT_NOP)
3531                         optlen = 1;
3532                 else {
3533 #ifdef DIAGNOSTIC
3534                         if (cnt < IPOPT_OLEN + sizeof (*cp))
3535                                 break;
3536 #endif
3537                         optlen = cp[IPOPT_OLEN];
3538 #ifdef DIAGNOSTIC
3539                         if (optlen < IPOPT_OLEN + sizeof (*cp) || optlen > cnt)
3540                                 break;
3541 #endif
3542                 }
3543                 switch (opt) {
3544                 case IPOPT_RA:
3545 #ifdef DIAGNOSTIC
3546                         if (optlen != IPOPT_OFFSET + sizeof (uint16_t) ||
3547                             (*((uint16_t *)(void *)&cp[IPOPT_OFFSET]) != 0))
3548                                 break;
3549                         else
3550 #endif
3551                                 found_ra = 1;
3552                         break;
3553                 default:
3554                         break;
3555                 }
3556         }
3557
3558         return (found_ra);
3559 }
3560
3561 /*
3562  * Given address of next destination (final or next hop),
3563  * return internet address info of interface to be used to get there.
3564  */
3565 struct in_ifaddr *
3566 ip_rtaddr(struct in_addr dst)
3567 {
3568         struct sockaddr_in *sin;
3569         struct ifaddr *rt_ifa;
3570         struct route ro;
3571
3572         bzero(&ro, sizeof (ro));
3573         sin = SIN(&ro.ro_dst);
3574         sin->sin_family = AF_INET;
3575         sin->sin_len = sizeof (*sin);
3576         sin->sin_addr = dst;
3577
3578         rtalloc_ign(&ro, RTF_PRCLONING);
3579         if (ro.ro_rt == NULL) {
3580                 ROUTE_RELEASE(&ro);
3581                 return (NULL);
3582         }
3583
3584         RT_LOCK(ro.ro_rt);
3585         if ((rt_ifa = ro.ro_rt->rt_ifa) != NULL)
3586                 IFA_ADDREF(rt_ifa);
3587         RT_UNLOCK(ro.ro_rt);
3588         ROUTE_RELEASE(&ro);
3589
3590         return ((struct in_ifaddr *)rt_ifa);
3591 }
3592
3593 /*
3594  * Save incoming source route for use in replies,
3595  * to be picked up later by ip_srcroute if the receiver is interested.
3596  */
3597 void
3598 save_rte(u_char *option, struct in_addr dst)
3599 {
3600         unsigned olen;
3601
3602         olen = option[IPOPT_OLEN];
3603 #if DIAGNOSTIC
3604         if (ipprintfs)
3605                 printf("save_rte: olen %d\n", olen);
3606 #endif
3607         if (olen > sizeof (ip_srcrt) - (1 + sizeof (dst)))
3608                 return;
3609         bcopy(option, ip_srcrt.srcopt, olen);
3610         ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof (struct in_addr);
3611         ip_srcrt.dst = dst;
3612 }
3613
3614 /*
3615  * Retrieve incoming source route for use in replies,
3616  * in the same form used by setsockopt.
3617  * The first hop is placed before the options, will be removed later.
3618  */
3619 struct mbuf *
3620 ip_srcroute(void)
3621 {
3622         struct in_addr *p, *q;
3623         struct mbuf *m;
3624
3625         if (ip_nhops == 0)
3626                 return (NULL);
3627
3628         m = m_get(M_DONTWAIT, MT_HEADER);
3629         if (m == NULL)
3630                 return (NULL);
3631
3632 #define OPTSIZ  (sizeof (ip_srcrt.nop) + sizeof (ip_srcrt.srcopt))
3633
3634         /* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */
3635         m->m_len = ip_nhops * sizeof (struct in_addr) +
3636             sizeof (struct in_addr) + OPTSIZ;
3637 #if DIAGNOSTIC
3638         if (ipprintfs)
3639                 printf("ip_srcroute: nhops %d mlen %d", ip_nhops, m->m_len);
3640 #endif
3641
3642         /*
3643          * First save first hop for return route
3644          */
3645         p = &ip_srcrt.route[ip_nhops - 1];
3646         *(mtod(m, struct in_addr *)) = *p--;
3647 #if DIAGNOSTIC
3648         if (ipprintfs)
3649                 printf(" hops %lx",
3650                     (u_int32_t)ntohl(mtod(m, struct in_addr *)->s_addr));
3651 #endif
3652
3653         /*
3654          * Copy option fields and padding (nop) to mbuf.
3655          */
3656         ip_srcrt.nop = IPOPT_NOP;
3657         ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF;
3658         (void) memcpy(mtod(m, caddr_t) + sizeof (struct in_addr),
3659             &ip_srcrt.nop, OPTSIZ);
3660         q = (struct in_addr *)(void *)(mtod(m, caddr_t) +
3661             sizeof (struct in_addr) + OPTSIZ);
3662 #undef OPTSIZ
3663         /*
3664          * Record return path as an IP source route,
3665          * reversing the path (pointers are now aligned).
3666          */
3667         while (p >= ip_srcrt.route) {
3668 #if DIAGNOSTIC
3669                 if (ipprintfs)
3670                         printf(" %lx", (u_int32_t)ntohl(q->s_addr));
3671 #endif
3672                 *q++ = *p--;
3673         }
3674         /*
3675          * Last hop goes to final destination.
3676          */
3677         *q = ip_srcrt.dst;
3678 #if DIAGNOSTIC
3679         if (ipprintfs)
3680                 printf(" %lx\n", (u_int32_t)ntohl(q->s_addr));
3681 #endif
3682         return (m);
3683 }
3684
3685 /*
3686  * Strip out IP options, at higher level protocol in the kernel.
3687  */
3688 void
3689 ip_stripoptions(struct mbuf *m)
3690 {
3691         int i;
3692         struct ip *ip = mtod(m, struct ip *);
3693         caddr_t opts;
3694         int olen;
3695
3696         /* Expect 32-bit aligned data pointer on strict-align platforms */
3697         MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
3698
3699         /* use bcopy() since it supports overlapping range */
3700         olen = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip);
3701         opts = (caddr_t)(ip + 1);
3702         i = m->m_len - (sizeof (struct ip) + olen);
3703         bcopy(opts + olen, opts, (unsigned)i);
3704         m->m_len -= olen;
3705         if (m->m_flags & M_PKTHDR)
3706                 m->m_pkthdr.len -= olen;
3707         ip->ip_vhl = IP_MAKE_VHL(IPVERSION, sizeof (struct ip) >> 2);
3708
3709         /*
3710          * We expect ip_{off,len} to be in host order by now, and
3711          * that the original IP header length has been subtracted
3712          * out from ip_len.  Temporarily adjust ip_len for checksum
3713          * recalculation, and restore it afterwards.
3714          */
3715         ip->ip_len += sizeof (struct ip);
3716
3717         /* recompute checksum now that IP header is smaller */
3718 #if BYTE_ORDER != BIG_ENDIAN
3719         HTONS(ip->ip_len);
3720         HTONS(ip->ip_off);
3721 #endif /* BYTE_ORDER != BIG_ENDIAN */
3722         ip->ip_sum = in_cksum_hdr(ip);
3723 #if BYTE_ORDER != BIG_ENDIAN
3724         NTOHS(ip->ip_off);
3725         NTOHS(ip->ip_len);
3726 #endif /* BYTE_ORDER != BIG_ENDIAN */
3727
3728         ip->ip_len -= sizeof (struct ip);
3729 }
3730
3731 u_char inetctlerrmap[PRC_NCMDS] = {
3732         0,              0,              0,              0,
3733         0,              EMSGSIZE,       EHOSTDOWN,      EHOSTUNREACH,
3734         ENETUNREACH,    EHOSTUNREACH,   ECONNREFUSED,   ECONNREFUSED,
3735         EMSGSIZE,       EHOSTUNREACH,   0,              0,
3736         0,              0,              0,              0,
3737         ENOPROTOOPT,    ECONNREFUSED
3738 };
3739
3740 static int
3741 sysctl_ipforwarding SYSCTL_HANDLER_ARGS
3742 {
3743 #pragma unused(arg1, arg2)
3744         int i, was_ipforwarding = ipforwarding;
3745
3746         i = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
3747         if (i != 0 || req->newptr == USER_ADDR_NULL)
3748                 return (i);
3749
3750         if (was_ipforwarding && !ipforwarding) {
3751                 /* clean up IPv4 forwarding cached routes */
3752                 ifnet_head_lock_shared();
3753                 for (i = 0; i <= if_index; i++) {
3754                         struct ifnet *ifp = ifindex2ifnet[i];
3755                         if (ifp != NULL) {
3756                                 lck_mtx_lock(&ifp->if_cached_route_lock);
3757                                 ROUTE_RELEASE(&ifp->if_fwd_route);
3758                                 bzero(&ifp->if_fwd_route,
3759                                     sizeof (ifp->if_fwd_route));
3760                                 lck_mtx_unlock(&ifp->if_cached_route_lock);
3761                         }
3762                 }
3763                 ifnet_head_done();
3764         }
3765
3766         return (0);
3767 }
3768
3769 /*
3770  * Similar to inp_route_{copyout,copyin} routines except that these copy
3771  * out the cached IPv4 forwarding route from struct ifnet instead of the
3772  * inpcb.  See comments for those routines for explanations.
3773  */
3774 static void
3775 ip_fwd_route_copyout(struct ifnet *ifp, struct route *dst)
3776 {
3777         struct route *src = &ifp->if_fwd_route;
3778
3779         lck_mtx_lock_spin(&ifp->if_cached_route_lock);
3780         lck_mtx_convert_spin(&ifp->if_cached_route_lock);
3781
3782         /* Minor sanity check */
3783         if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET)
3784                 panic("%s: wrong or corrupted route: %p", __func__, src);
3785
3786         route_copyout(dst, src, sizeof (*dst));
3787
3788         lck_mtx_unlock(&ifp->if_cached_route_lock);
3789 }
3790
3791 static void
3792 ip_fwd_route_copyin(struct ifnet *ifp, struct route *src)
3793 {
3794         struct route *dst = &ifp->if_fwd_route;
3795
3796         lck_mtx_lock_spin(&ifp->if_cached_route_lock);
3797         lck_mtx_convert_spin(&ifp->if_cached_route_lock);
3798
3799         /* Minor sanity check */
3800         if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET)
3801                 panic("%s: wrong or corrupted route: %p", __func__, src);
3802
3803         if (ifp->if_fwd_cacheok)
3804                 route_copyin(src, dst, sizeof (*src));
3805
3806         lck_mtx_unlock(&ifp->if_cached_route_lock);
3807 }
3808
3809 /*
3810  * Forward a packet.  If some error occurs return the sender
3811  * an icmp packet.  Note we can't always generate a meaningful
3812  * icmp message because icmp doesn't have a large enough repertoire
3813  * of codes and types.
3814  *
3815  * If not forwarding, just drop the packet.  This could be confusing
3816  * if ipforwarding was zero but some routing protocol was advertising
3817  * us as a gateway to somewhere.  However, we must let the routing
3818  * protocol deal with that.
3819  *
3820  * The srcrt parameter indicates whether the packet is being forwarded
3821  * via a source route.
3822  */
3823 static void
3824 ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop)
3825 {
3826 #if !IPFIREWALL
3827 #pragma unused(next_hop)
3828 #endif
3829         struct ip *ip = mtod(m, struct ip *);
3830         struct sockaddr_in *sin;
3831         struct rtentry *rt;
3832         struct route fwd_rt;
3833         int error, type = 0, code = 0;
3834         struct mbuf *mcopy;
3835         n_long dest;
3836         struct in_addr pkt_dst;
3837         u_int32_t nextmtu = 0, len;
3838         struct ip_out_args ipoa = { IFSCOPE_NONE, { 0 }, 0, 0,
3839             SO_TC_UNSPEC, _NET_SERVICE_TYPE_UNSPEC };
3840         struct ifnet *rcvifp = m->m_pkthdr.rcvif;
3841 #if IPSEC
3842         struct secpolicy *sp = NULL;
3843         int ipsecerror;
3844 #endif /* IPSEC */
3845 #if PF
3846         struct pf_mtag *pf_mtag;
3847 #endif /* PF */
3848
3849         dest = 0;
3850 #if IPFIREWALL
3851         /*
3852          * Cache the destination address of the packet; this may be
3853          * changed by use of 'ipfw fwd'.
3854          */
3855         pkt_dst = ((next_hop != NULL) ? next_hop->sin_addr : ip->ip_dst);
3856 #else /* !IPFIREWALL */
3857         pkt_dst = ip->ip_dst;
3858 #endif /* !IPFIREWALL */
3859
3860 #if DIAGNOSTIC
3861         if (ipprintfs)
3862                 printf("forward: src %lx dst %lx ttl %x\n",
3863                     (u_int32_t)ip->ip_src.s_addr, (u_int32_t)pkt_dst.s_addr,
3864                     ip->ip_ttl);
3865 #endif
3866
3867         if (m->m_flags & (M_BCAST|M_MCAST) || !in_canforward(pkt_dst)) {
3868                 OSAddAtomic(1, &ipstat.ips_cantforward);
3869                 m_freem(m);
3870                 return;
3871         }
3872 #if IPSTEALTH
3873         if (!ipstealth) {
3874 #endif /* IPSTEALTH */
3875                 if (ip->ip_ttl <= IPTTLDEC) {
3876                         icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS,
3877                             dest, 0);
3878                         return;
3879                 }
3880 #if IPSTEALTH
3881         }
3882 #endif /* IPSTEALTH */
3883
3884 #if PF
3885         pf_mtag = pf_find_mtag(m);
3886         if (pf_mtag != NULL && pf_mtag->pftag_rtableid != IFSCOPE_NONE) {
3887                 ipoa.ipoa_boundif = pf_mtag->pftag_rtableid;
3888                 ipoa.ipoa_flags |= IPOAF_BOUND_IF;
3889         }
3890 #endif /* PF */
3891
3892         ip_fwd_route_copyout(rcvifp, &fwd_rt);
3893
3894         sin = SIN(&fwd_rt.ro_dst);
3895         if (ROUTE_UNUSABLE(&fwd_rt) || pkt_dst.s_addr != sin->sin_addr.s_addr) {
3896                 ROUTE_RELEASE(&fwd_rt);
3897
3898                 sin->sin_family = AF_INET;
3899                 sin->sin_len = sizeof (*sin);
3900                 sin->sin_addr = pkt_dst;
3901
3902                 rtalloc_scoped_ign(&fwd_rt, RTF_PRCLONING, ipoa.ipoa_boundif);
3903                 if (fwd_rt.ro_rt == NULL) {
3904                         icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0);
3905                         goto done;
3906                 }
3907         }
3908         rt = fwd_rt.ro_rt;
3909
3910         /*
3911          * Save the IP header and at most 8 bytes of the payload,
3912          * in case we need to generate an ICMP message to the src.
3913          *
3914          * We don't use m_copy() because it might return a reference
3915          * to a shared cluster. Both this function and ip_output()
3916          * assume exclusive access to the IP header in `m', so any
3917          * data in a cluster may change before we reach icmp_error().
3918          */
3919         MGET(mcopy, M_DONTWAIT, m->m_type);
3920         if (mcopy != NULL) {
3921                 M_COPY_PKTHDR(mcopy, m);
3922                 mcopy->m_len = imin((IP_VHL_HL(ip->ip_vhl) << 2) + 8,
3923                     (int)ip->ip_len);
3924                 m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t));
3925         }
3926
3927 #if IPSTEALTH
3928         if (!ipstealth) {
3929 #endif /* IPSTEALTH */
3930                 ip->ip_ttl -= IPTTLDEC;
3931 #if IPSTEALTH
3932         }
3933 #endif /* IPSTEALTH */
3934
3935         /*
3936          * If forwarding packet using same interface that it came in on,
3937          * perhaps should send a redirect to sender to shortcut a hop.
3938          * Only send redirect if source is sending directly to us,
3939          * and if packet was not source routed (or has any options).
3940          * Also, don't send redirect if forwarding using a default route
3941          * or a route modified by a redirect.
3942          */
3943         RT_LOCK_SPIN(rt);
3944         if (rt->rt_ifp == m->m_pkthdr.rcvif &&
3945             !(rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) &&
3946             satosin(rt_key(rt))->sin_addr.s_addr != INADDR_ANY &&
3947             ipsendredirects && !srcrt && rt->rt_ifa != NULL) {
3948                 struct in_ifaddr *ia = (struct in_ifaddr *)rt->rt_ifa;
3949                 u_int32_t src = ntohl(ip->ip_src.s_addr);
3950
3951                 /* Become a regular mutex */
3952                 RT_CONVERT_LOCK(rt);
3953                 IFA_LOCK_SPIN(&ia->ia_ifa);
3954                 if ((src & ia->ia_subnetmask) == ia->ia_subnet) {
3955                         if (rt->rt_flags & RTF_GATEWAY)
3956                                 dest = satosin(rt->rt_gateway)->sin_addr.s_addr;
3957                         else
3958                                 dest = pkt_dst.s_addr;
3959                         /*
3960                          * Router requirements says to only send
3961                          * host redirects.
3962                          */
3963                         type = ICMP_REDIRECT;
3964                         code = ICMP_REDIRECT_HOST;
3965 #if DIAGNOSTIC
3966                         if (ipprintfs)
3967                                 printf("redirect (%d) to %lx\n", code,
3968                                     (u_int32_t)dest);
3969 #endif
3970                 }
3971                 IFA_UNLOCK(&ia->ia_ifa);
3972         }
3973         RT_UNLOCK(rt);
3974
3975 #if IPFIREWALL
3976         if (next_hop != NULL) {
3977                 /* Pass IPFORWARD info if available */
3978                 struct m_tag *tag;
3979                 struct ip_fwd_tag *ipfwd_tag;
3980
3981                 tag = m_tag_create(KERNEL_MODULE_TAG_ID,
3982                     KERNEL_TAG_TYPE_IPFORWARD,
3983                     sizeof (*ipfwd_tag), M_NOWAIT, m);
3984                 if (tag == NULL) {
3985                         error = ENOBUFS;
3986                         m_freem(m);
3987                         goto done;
3988                 }
3989
3990                 ipfwd_tag = (struct ip_fwd_tag *)(tag+1);
3991                 ipfwd_tag->next_hop = next_hop;
3992
3993                 m_tag_prepend(m, tag);
3994         }
3995 #endif /* IPFIREWALL */
3996
3997         /* Mark this packet as being forwarded from another interface */
3998         m->m_pkthdr.pkt_flags |= PKTF_FORWARDED;
3999         len = m_pktlen(m);
4000
4001         error = ip_output(m, NULL, &fwd_rt, IP_FORWARDING | IP_OUTARGS,
4002             NULL, &ipoa);
4003
4004         /* Refresh rt since the route could have changed while in IP */
4005         rt = fwd_rt.ro_rt;
4006
4007         if (error != 0) {
4008                 OSAddAtomic(1, &ipstat.ips_cantforward);
4009         } else {
4010                 /*
4011                  * Increment stats on the source interface; the ones
4012                  * for destination interface has been taken care of
4013                  * during output above by virtue of PKTF_FORWARDED.
4014                  */
4015                 rcvifp->if_fpackets++;
4016                 rcvifp->if_fbytes += len;
4017
4018                 OSAddAtomic(1, &ipstat.ips_forward);
4019                 if (type != 0) {
4020                         OSAddAtomic(1, &ipstat.ips_redirectsent);
4021                 } else {
4022                         if (mcopy != NULL) {
4023                                 /*
4024                                  * If we didn't have to go thru ipflow and
4025                                  * the packet was successfully consumed by
4026                                  * ip_output, the mcopy is rather a waste;
4027                                  * this could be further optimized.
4028                                  */
4029                                 m_freem(mcopy);
4030                         }
4031                         goto done;
4032                 }
4033         }
4034         if (mcopy == NULL)
4035                 goto done;
4036
4037         switch (error) {
4038         case 0:                         /* forwarded, but need redirect */
4039                 /* type, code set above */
4040                 break;
4041
4042         case ENETUNREACH:               /* shouldn't happen, checked above */
4043         case EHOSTUNREACH:
4044         case ENETDOWN:
4045         case EHOSTDOWN:
4046         default:
4047                 type = ICMP_UNREACH;
4048                 code = ICMP_UNREACH_HOST;
4049                 break;
4050
4051         case EMSGSIZE:
4052                 type = ICMP_UNREACH;
4053                 code = ICMP_UNREACH_NEEDFRAG;
4054
4055                 if (rt == NULL) {
4056                         break;
4057                 } else {
4058                         RT_LOCK_SPIN(rt);
4059                         if (rt->rt_ifp != NULL)
4060                                 nextmtu = rt->rt_ifp->if_mtu;
4061                         RT_UNLOCK(rt);
4062                 }
4063 #ifdef IPSEC
4064                 if (ipsec_bypass)
4065                         break;
4066
4067                 /*
4068                  * If the packet is routed over IPsec tunnel, tell the
4069                  * originator the tunnel MTU.
4070                  *      tunnel MTU = if MTU - sizeof(IP) - ESP/AH hdrsiz
4071                  * XXX quickhack!!!
4072                  */
4073                 sp = ipsec4_getpolicybyaddr(mcopy, IPSEC_DIR_OUTBOUND,
4074                     IP_FORWARDING, &ipsecerror);
4075
4076                 if (sp == NULL)
4077                         break;
4078
4079                 /*
4080                  * find the correct route for outer IPv4
4081                  * header, compute tunnel MTU.
4082                  */
4083                 nextmtu = 0;
4084
4085                 if (sp->req != NULL &&
4086                     sp->req->saidx.mode == IPSEC_MODE_TUNNEL) {
4087                         struct secasindex saidx;
4088                         struct secasvar *sav;
4089                         struct route *ro;
4090                         struct ip *ipm;
4091                         int ipsechdr;
4092
4093                         /* count IPsec header size */
4094                         ipsechdr = ipsec_hdrsiz(sp);
4095
4096                         ipm = mtod(mcopy, struct ip *);
4097                         bcopy(&sp->req->saidx, &saidx, sizeof (saidx));
4098                         saidx.mode = sp->req->saidx.mode;
4099                         saidx.reqid = sp->req->saidx.reqid;
4100                         sin = SIN(&saidx.src);
4101                         if (sin->sin_len == 0) {
4102                                 sin->sin_len = sizeof (*sin);
4103                                 sin->sin_family = AF_INET;
4104                                 sin->sin_port = IPSEC_PORT_ANY;
4105                                 bcopy(&ipm->ip_src, &sin->sin_addr,
4106                                     sizeof (sin->sin_addr));
4107                         }
4108                         sin = SIN(&saidx.dst);
4109                         if (sin->sin_len == 0) {
4110                                 sin->sin_len = sizeof (*sin);
4111                                 sin->sin_family = AF_INET;
4112                                 sin->sin_port = IPSEC_PORT_ANY;
4113                                 bcopy(&ipm->ip_dst, &sin->sin_addr,
4114                                     sizeof (sin->sin_addr));
4115                         }
4116                         sav = key_allocsa_policy(&saidx);
4117                         if (sav != NULL) {
4118                                 lck_mtx_lock(sadb_mutex);
4119                                 if (sav->sah != NULL) {
4120                                         ro = &sav->sah->sa_route;
4121                                         if (ro->ro_rt != NULL) {
4122                                                 RT_LOCK(ro->ro_rt);
4123                                                 if (ro->ro_rt->rt_ifp != NULL) {
4124                                                         nextmtu = ro->ro_rt->
4125                                                             rt_ifp->if_mtu;
4126                                                         nextmtu -= ipsechdr;
4127                                                 }
4128                                                 RT_UNLOCK(ro->ro_rt);
4129                                         }
4130                                 }
4131                                 key_freesav(sav, KEY_SADB_LOCKED);
4132                                 lck_mtx_unlock(sadb_mutex);
4133                         }
4134                 }
4135                 key_freesp(sp, KEY_SADB_UNLOCKED);
4136 #endif /* IPSEC */
4137                 break;
4138
4139         case ENOBUFS:
4140                 /*
4141                  * A router should not generate ICMP_SOURCEQUENCH as
4142                  * required in RFC1812 Requirements for IP Version 4 Routers.
4143                  * Source quench could be a big problem under DoS attacks,
4144                  * or if the underlying interface is rate-limited.
4145                  * Those who need source quench packets may re-enable them
4146                  * via the net.inet.ip.sendsourcequench sysctl.
4147                  */
4148                 if (ip_sendsourcequench == 0) {
4149                         m_freem(mcopy);
4150                         goto done;
4151                 } else {
4152                         type = ICMP_SOURCEQUENCH;
4153                         code = 0;
4154                 }
4155                 break;
4156
4157         case EACCES:                    /* ipfw denied packet */
4158                 m_freem(mcopy);
4159                 goto done;
4160         }
4161
4162         if (type == ICMP_UNREACH && code == ICMP_UNREACH_NEEDFRAG)
4163                 OSAddAtomic(1, &ipstat.ips_cantfrag);
4164
4165         icmp_error(mcopy, type, code, dest, nextmtu);
4166 done:
4167         ip_fwd_route_copyin(rcvifp, &fwd_rt);
4168 }
4169
4170 int
4171 ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip,
4172     struct mbuf *m)
4173 {
4174         *mp = NULL;
4175         if (inp->inp_socket->so_options & SO_TIMESTAMP) {
4176                 struct timeval tv;
4177
4178                 getmicrotime(&tv);
4179                 mp = sbcreatecontrol_mbuf((caddr_t)&tv, sizeof (tv),
4180                     SCM_TIMESTAMP, SOL_SOCKET, mp);
4181                 if (*mp == NULL) {
4182                         goto no_mbufs;
4183                 }
4184         }
4185         if (inp->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) {
4186                 uint64_t time;
4187
4188                 time = mach_absolute_time();
4189                 mp = sbcreatecontrol_mbuf((caddr_t)&time, sizeof (time),
4190                     SCM_TIMESTAMP_MONOTONIC, SOL_SOCKET, mp);
4191                 if (*mp == NULL) {
4192                         goto no_mbufs;
4193                 }
4194         }
4195         if (inp->inp_flags & INP_RECVDSTADDR) {
4196                 mp = sbcreatecontrol_mbuf((caddr_t)&ip->ip_dst,
4197                     sizeof (struct in_addr), IP_RECVDSTADDR, IPPROTO_IP, mp);
4198                 if (*mp == NULL) {
4199                         goto no_mbufs;
4200                 }
4201         }
4202 #ifdef notyet
4203         /*
4204          * XXX
4205          * Moving these out of udp_input() made them even more broken
4206          * than they already were.
4207          */
4208         /* options were tossed already */
4209         if (inp->inp_flags & INP_RECVOPTS) {
4210                 mp = sbcreatecontrol_mbuf((caddr_t)opts_deleted_above,
4211                     sizeof (struct in_addr), IP_RECVOPTS, IPPROTO_IP, mp);
4212                 if (*mp == NULL) {
4213                         goto no_mbufs;
4214                 }
4215         }
4216         /* ip_srcroute doesn't do what we want here, need to fix */
4217         if (inp->inp_flags & INP_RECVRETOPTS) {
4218                 mp = sbcreatecontrol_mbuf((caddr_t)ip_srcroute(),
4219                     sizeof (struct in_addr), IP_RECVRETOPTS, IPPROTO_IP, mp);
4220                 if (*mp == NULL) {
4221                         goto no_mbufs;
4222                 }
4223         }
4224 #endif /* notyet */
4225         if (inp->inp_flags & INP_RECVIF) {
4226                 struct ifnet *ifp;
4227                 uint8_t sdlbuf[SOCK_MAXADDRLEN + 1];
4228                 struct sockaddr_dl *sdl2 = SDL(&sdlbuf);
4229
4230                 /*
4231                  * Make sure to accomodate the largest possible
4232                  * size of SA(if_lladdr)->sa_len.
4233                  */
4234                 _CASSERT(sizeof (sdlbuf) == (SOCK_MAXADDRLEN + 1));
4235
4236                 ifnet_head_lock_shared();
4237                 if ((ifp = m->m_pkthdr.rcvif) != NULL &&
4238                     ifp->if_index && (ifp->if_index <= if_index)) {
4239                         struct ifaddr *ifa = ifnet_addrs[ifp->if_index - 1];
4240                         struct sockaddr_dl *sdp;
4241
4242                         if (!ifa || !ifa->ifa_addr)
4243                                 goto makedummy;
4244
4245                         IFA_LOCK_SPIN(ifa);
4246                         sdp = SDL(ifa->ifa_addr);
4247                         /*
4248                          * Change our mind and don't try copy.
4249                          */
4250                         if (sdp->sdl_family != AF_LINK) {
4251                                 IFA_UNLOCK(ifa);
4252                                 goto makedummy;
4253                         }
4254                         /* the above _CASSERT ensures sdl_len fits in sdlbuf */
4255                         bcopy(sdp, sdl2, sdp->sdl_len);
4256                         IFA_UNLOCK(ifa);
4257                 } else {
4258 makedummy:
4259                         sdl2->sdl_len =
4260                             offsetof(struct sockaddr_dl, sdl_data[0]);
4261                         sdl2->sdl_family = AF_LINK;
4262                         sdl2->sdl_index = 0;
4263                         sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0;
4264                 }
4265                 ifnet_head_done();
4266                 mp = sbcreatecontrol_mbuf((caddr_t)sdl2, sdl2->sdl_len,
4267                     IP_RECVIF, IPPROTO_IP, mp);
4268                 if (*mp == NULL) {
4269                         goto no_mbufs;
4270                 }
4271         }
4272         if (inp->inp_flags & INP_RECVTTL) {
4273                 mp = sbcreatecontrol_mbuf((caddr_t)&ip->ip_ttl,
4274                     sizeof (ip->ip_ttl), IP_RECVTTL, IPPROTO_IP, mp);
4275                 if (*mp == NULL) {
4276                         goto no_mbufs;
4277                 }
4278         }
4279         if (inp->inp_socket->so_flags & SOF_RECV_TRAFFIC_CLASS) {
4280                 int tc = m_get_traffic_class(m);
4281
4282                 mp = sbcreatecontrol_mbuf((caddr_t)&tc, sizeof (tc),
4283                     SO_TRAFFIC_CLASS, SOL_SOCKET, mp);
4284                 if (*mp == NULL) {
4285                         goto no_mbufs;
4286                 }
4287         }
4288         if (inp->inp_flags & INP_PKTINFO) {
4289                 struct in_pktinfo pi;
4290
4291                 bzero(&pi, sizeof (struct in_pktinfo));
4292                 bcopy(&ip->ip_dst, &pi.ipi_addr, sizeof (struct in_addr));
4293                 pi.ipi_ifindex = (m != NULL && m->m_pkthdr.rcvif != NULL) ?
4294                     m->m_pkthdr.rcvif->if_index : 0;
4295
4296                 mp = sbcreatecontrol_mbuf((caddr_t)&pi,
4297                     sizeof (struct in_pktinfo), IP_RECVPKTINFO, IPPROTO_IP, mp);
4298                 if (*mp == NULL) {
4299                         goto no_mbufs;
4300                 }
4301         }
4302         if (inp->inp_flags & INP_RECVTOS) {
4303                 mp = sbcreatecontrol_mbuf((caddr_t)&ip->ip_tos,
4304                     sizeof(u_char), IP_RECVTOS, IPPROTO_IP, mp);
4305                 if (*mp == NULL) {
4306                         goto no_mbufs;
4307                 }
4308         }
4309         return (0);
4310
4311 no_mbufs:
4312         ipstat.ips_pktdropcntrl++;
4313         return (ENOBUFS);
4314 }
4315
4316 static inline u_short
4317 ip_cksum(struct mbuf *m, int hlen)
4318 {
4319         u_short sum;
4320
4321         if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) {
4322                 sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID);
4323         } else if (!(m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) &&
4324             !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
4325                 /*
4326                  * The packet arrived on an interface which isn't capable
4327                  * of performing IP header checksum; compute it now.
4328                  */
4329                 sum = ip_cksum_hdr_in(m, hlen);
4330         } else {
4331                 sum = 0;
4332                 m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
4333                     CSUM_IP_CHECKED | CSUM_IP_VALID);
4334                 m->m_pkthdr.csum_data = 0xffff;
4335         }
4336
4337         if (sum != 0)
4338                 OSAddAtomic(1, &ipstat.ips_badsum);
4339
4340         return (sum);
4341 }
4342
4343 static int
4344 ip_getstat SYSCTL_HANDLER_ARGS
4345 {
4346 #pragma unused(oidp, arg1, arg2)
4347         if (req->oldptr == USER_ADDR_NULL)
4348                 req->oldlen = (size_t)sizeof (struct ipstat);
4349
4350         return (SYSCTL_OUT(req, &ipstat, MIN(sizeof (ipstat), req->oldlen)));
4351 }
4352
4353 void
4354 ip_setsrcifaddr_info(struct mbuf *m, uint32_t src_idx, struct in_ifaddr *ia)
4355 {
4356         VERIFY(m->m_flags & M_PKTHDR);
4357
4358         /*
4359          * If the source ifaddr is specified, pick up the information
4360          * from there; otherwise just grab the passed-in ifindex as the
4361          * caller may not have the ifaddr available.
4362          */
4363         if (ia != NULL) {
4364                 m->m_pkthdr.pkt_flags |= PKTF_IFAINFO;
4365                 m->m_pkthdr.src_ifindex = ia->ia_ifp->if_index;
4366         } else {
4367                 m->m_pkthdr.src_ifindex = src_idx;
4368                 if (src_idx != 0)
4369                         m->m_pkthdr.pkt_flags |= PKTF_IFAINFO;
4370         }
4371 }
4372
4373 void
4374 ip_setdstifaddr_info(struct mbuf *m, uint32_t dst_idx, struct in_ifaddr *ia)
4375 {
4376         VERIFY(m->m_flags & M_PKTHDR);
4377
4378         /*
4379          * If the destination ifaddr is specified, pick up the information
4380          * from there; otherwise just grab the passed-in ifindex as the
4381          * caller may not have the ifaddr available.
4382          */
4383         if (ia != NULL) {
4384                 m->m_pkthdr.pkt_flags |= PKTF_IFAINFO;
4385                 m->m_pkthdr.dst_ifindex = ia->ia_ifp->if_index;
4386         } else {
4387                 m->m_pkthdr.dst_ifindex = dst_idx;
4388                 if (dst_idx != 0)
4389                         m->m_pkthdr.pkt_flags |= PKTF_IFAINFO;
4390         }
4391 }
4392
4393 int
4394 ip_getsrcifaddr_info(struct mbuf *m, uint32_t *src_idx, uint32_t *iaf)
4395 {
4396         VERIFY(m->m_flags & M_PKTHDR);
4397
4398         if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO))
4399                 return (-1);
4400
4401         if (src_idx != NULL)
4402                 *src_idx = m->m_pkthdr.src_ifindex;
4403
4404         if (iaf != NULL)
4405                 *iaf = 0;
4406
4407         return (0);
4408 }
4409
4410 int
4411 ip_getdstifaddr_info(struct mbuf *m, uint32_t *dst_idx, uint32_t *iaf)
4412 {
4413         VERIFY(m->m_flags & M_PKTHDR);
4414
4415         if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO))
4416                 return (-1);
4417
4418         if (dst_idx != NULL)
4419                 *dst_idx = m->m_pkthdr.dst_ifindex;
4420
4421         if (iaf != NULL)
4422                 *iaf = 0;
4423
4424         return (0);
4425 }
4426
4427 /*
4428  * Protocol input handler for IPPROTO_GRE.
4429  */
4430 void
4431 gre_input(struct mbuf *m, int off)
4432 {
4433         gre_input_func_t fn = gre_input_func;
4434
4435         /*
4436          * If there is a registered GRE input handler, pass mbuf to it.
4437          */
4438         if (fn != NULL) {
4439                 lck_mtx_unlock(inet_domain_mutex);
4440                 m = fn(m, off, (mtod(m, struct ip *))->ip_p);
4441                 lck_mtx_lock(inet_domain_mutex);
4442         }
4443
4444         /*
4445          * If no matching tunnel that is up is found, we inject
4446          * the mbuf to raw ip socket to see if anyone picks it up.
4447          */
4448         if (m != NULL)
4449                 rip_input(m, off);
4450 }
4451
4452 /*
4453  * Private KPI for PPP/PPTP.
4454  */
4455 int
4456 ip_gre_register_input(gre_input_func_t fn)
4457 {
4458         lck_mtx_lock(inet_domain_mutex);
4459         gre_input_func = fn;
4460         lck_mtx_unlock(inet_domain_mutex);
4461
4462         return (0);
4463 }
4464
4465 #if (DEBUG || DEVELOPMENT)
4466 static int
4467 sysctl_reset_ip_input_stats SYSCTL_HANDLER_ARGS
4468 {
4469 #pragma unused(arg1, arg2)
4470         int error, i;
4471
4472         i = ip_input_measure;
4473         error = sysctl_handle_int(oidp, &i, 0, req);
4474         if (error || req->newptr == USER_ADDR_NULL)
4475                 goto done;
4476         /* impose bounds */
4477         if (i < 0 || i > 1) {
4478                 error = EINVAL;
4479                 goto done;
4480         }
4481         if (ip_input_measure != i && i == 1) {
4482                 net_perf_initialize(&net_perf, ip_input_measure_bins);
4483         }
4484         ip_input_measure = i;
4485 done:
4486         return (error);
4487 }
4488
4489 static int
4490 sysctl_ip_input_measure_bins SYSCTL_HANDLER_ARGS
4491 {
4492 #pragma unused(arg1, arg2)
4493         int error;
4494         uint64_t i;
4495
4496         i = ip_input_measure_bins;
4497         error = sysctl_handle_quad(oidp, &i, 0, req);
4498         if (error || req->newptr == USER_ADDR_NULL)
4499                 goto done;
4500         /* validate data */
4501         if (!net_perf_validate_bins(i)) {
4502                 error = EINVAL;
4503                 goto done;
4504         }
4505         ip_input_measure_bins = i;
4506 done:
4507         return (error);
4508 }
4509
4510 static int
4511 sysctl_ip_input_getperf SYSCTL_HANDLER_ARGS
4512 {
4513 #pragma unused(oidp, arg1, arg2)
4514         if (req->oldptr == USER_ADDR_NULL)
4515                 req->oldlen = (size_t)sizeof (struct ipstat);
4516
4517         return (SYSCTL_OUT(req, &net_perf, MIN(sizeof (net_perf), req->oldlen)));
4518 }
4519 #endif /* (DEBUG || DEVELOPMENT) */