bsd/netinet/ip_input.c

   1 /*
   2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * Copyright (c) 1982, 1986, 1988, 1993
  30  *      The Regents of the University of California.  All rights reserved.
  31  *
  32  * Redistribution and use in source and binary forms, with or without
  33  * modification, are permitted provided that the following conditions
  34  * are met:
  35  * 1. Redistributions of source code must retain the above copyright
  36  *    notice, this list of conditions and the following disclaimer.
  37  * 2. Redistributions in binary form must reproduce the above copyright
  38  *    notice, this list of conditions and the following disclaimer in the
  39  *    documentation and/or other materials provided with the distribution.
  40  * 3. All advertising materials mentioning features or use of this software
  41  *    must display the following acknowledgement:
  42  *      This product includes software developed by the University of
  43  *      California, Berkeley and its contributors.
  44  * 4. Neither the name of the University nor the names of its contributors
  45  *    may be used to endorse or promote products derived from this software
  46  *    without specific prior written permission.
  47  *
  48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  58  * SUCH DAMAGE.
  59  *
  60  *      @(#)ip_input.c  8.2 (Berkeley) 1/4/94
  61  */
  62 /*
  63  * NOTICE: This file was modified by SPARTA, Inc. in 2007 to introduce
  64  * support for mandatory and extensible security protections.  This notice
  65  * is included in support of clause 2.2 (b) of the Apple Public License,
  66  * Version 2.0.
  67  */
  68
  69 #define _IP_VHL
  70
  71 #include <sys/param.h>
  72 #include <sys/systm.h>
  73 #include <sys/mbuf.h>
  74 #include <sys/malloc.h>
  75 #include <sys/domain.h>
  76 #include <sys/protosw.h>
  77 #include <sys/socket.h>
  78 #include <sys/time.h>
  79 #include <sys/kernel.h>
  80 #include <sys/syslog.h>
  81 #include <sys/sysctl.h>
  82 #include <sys/mcache.h>
  83 #include <sys/socketvar.h>
  84 #include <sys/kdebug.h>
  85 #include <mach/mach_time.h>
  86 #include <mach/sdt.h>
  87
  88 #include <machine/endian.h>
  89 #include <dev/random/randomdev.h>
  90
  91 #include <kern/queue.h>
  92 #include <kern/locks.h>
  93 #include <libkern/OSAtomic.h>
  94
  95 #include <pexpert/pexpert.h>
  96
  97 #include <net/if.h>
  98 #include <net/if_var.h>
  99 #include <net/if_dl.h>
 100 #include <net/route.h>
 101 #include <net/kpi_protocol.h>
 102 #include <net/ntstat.h>
 103 #include <net/dlil.h>
 104 #include <net/classq/classq.h>
 105 #include <net/net_perf.h>
 106 #include <net/init.h>
 107 #if PF
 108 #include <net/pfvar.h>
 109 #endif /* PF */
 110
 111 #include <netinet/in.h>
 112 #include <netinet/in_systm.h>
 113 #include <netinet/in_var.h>
 114 #include <netinet/in_arp.h>
 115 #include <netinet/ip.h>
 116 #include <netinet/in_pcb.h>
 117 #include <netinet/ip_var.h>
 118 #include <netinet/ip_icmp.h>
 119 #include <netinet/kpi_ipfilter_var.h>
 120 #include <netinet/udp.h>
 121 #include <netinet/udp_var.h>
 122 #include <netinet/bootp.h>
 123
 124 #if DUMMYNET
 125 #include <netinet/ip_dummynet.h>
 126 #endif /* DUMMYNET */
 127
 128 #if IPSEC
 129 #include <netinet6/ipsec.h>
 130 #include <netkey/key.h>
 131 #endif /* IPSEC */
 132
 133 #include <os/log.h>
 134
 135 #define DBG_LAYER_BEG           NETDBG_CODE(DBG_NETIP, 0)
 136 #define DBG_LAYER_END           NETDBG_CODE(DBG_NETIP, 2)
 137 #define DBG_FNC_IP_INPUT        NETDBG_CODE(DBG_NETIP, (2 << 8))
 138
 139 #if IPSEC
 140 extern int ipsec_bypass;
 141 extern lck_mtx_t *sadb_mutex;
 142
 143 lck_grp_t       *sadb_stat_mutex_grp;
 144 lck_grp_attr_t  *sadb_stat_mutex_grp_attr;
 145 lck_attr_t      *sadb_stat_mutex_attr;
 146 decl_lck_mtx_data(, sadb_stat_mutex_data);
 147 lck_mtx_t       *sadb_stat_mutex = &sadb_stat_mutex_data;
 148 #endif /* IPSEC */
 149
 150 MBUFQ_HEAD(fq_head);
 151
 152 static int frag_timeout_run;            /* frag timer is scheduled to run */
 153 static void frag_timeout(void *);
 154 static void frag_sched_timeout(void);
 155
 156 static struct ipq *ipq_alloc(int);
 157 static void ipq_free(struct ipq *);
 158 static void ipq_updateparams(void);
 159 static void ip_input_second_pass(struct mbuf *, struct ifnet *,
 160     int, int, struct ip_fw_in_args *);
 161
 162 decl_lck_mtx_data(static, ipqlock);
 163 static lck_attr_t       *ipqlock_attr;
 164 static lck_grp_t        *ipqlock_grp;
 165 static lck_grp_attr_t   *ipqlock_grp_attr;
 166
 167 /* Packet reassembly stuff */
 168 #define IPREASS_NHASH_LOG2      6
 169 #define IPREASS_NHASH           (1 << IPREASS_NHASH_LOG2)
 170 #define IPREASS_HMASK           (IPREASS_NHASH - 1)
 171 #define IPREASS_HASH(x, y) \
 172         (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK)
 173
 174 /* IP fragment reassembly queues (protected by ipqlock) */
 175 static TAILQ_HEAD(ipqhead, ipq) ipq[IPREASS_NHASH]; /* ip reassembly queues */
 176 static int maxnipq;                     /* max packets in reass queues */
 177 static u_int32_t maxfragsperpacket;     /* max frags/packet in reass queues */
 178 static u_int32_t nipq;                  /* # of packets in reass queues */
 179 static u_int32_t ipq_limit;             /* ipq allocation limit */
 180 static u_int32_t ipq_count;             /* current # of allocated ipq's */
 181
 182 static int sysctl_ipforwarding SYSCTL_HANDLER_ARGS;
 183 static int sysctl_maxnipq SYSCTL_HANDLER_ARGS;
 184 static int sysctl_maxfragsperpacket SYSCTL_HANDLER_ARGS;
 185
 186 #if (DEBUG || DEVELOPMENT)
 187 static int sysctl_reset_ip_input_stats SYSCTL_HANDLER_ARGS;
 188 static int sysctl_ip_input_measure_bins SYSCTL_HANDLER_ARGS;
 189 static int sysctl_ip_input_getperf SYSCTL_HANDLER_ARGS;
 190 #endif /* (DEBUG || DEVELOPMENT) */
 191
 192 int ipforwarding = 0;
 193 SYSCTL_PROC(_net_inet_ip, IPCTL_FORWARDING, forwarding,
 194     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &ipforwarding, 0,
 195     sysctl_ipforwarding, "I", "Enable IP forwarding between interfaces");
 196
 197 static int ipsendredirects = 1; /* XXX */
 198 SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect,
 199     CTLFLAG_RW | CTLFLAG_LOCKED, &ipsendredirects, 0,
 200     "Enable sending IP redirects");
 201
 202 int ip_defttl = IPDEFTTL;
 203 SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW | CTLFLAG_LOCKED,
 204     &ip_defttl, 0, "Maximum TTL on IP packets");
 205
 206 static int ip_dosourceroute = 0;
 207 SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute,
 208     CTLFLAG_RW | CTLFLAG_LOCKED, &ip_dosourceroute, 0,
 209     "Enable forwarding source routed IP packets");
 210
 211 static int ip_acceptsourceroute = 0;
 212 SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute,
 213     CTLFLAG_RW | CTLFLAG_LOCKED, &ip_acceptsourceroute, 0,
 214     "Enable accepting source routed IP packets");
 215
 216 static int ip_sendsourcequench = 0;
 217 SYSCTL_INT(_net_inet_ip, OID_AUTO, sendsourcequench,
 218     CTLFLAG_RW | CTLFLAG_LOCKED, &ip_sendsourcequench, 0,
 219     "Enable the transmission of source quench packets");
 220
 221 SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets,
 222     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &maxnipq, 0, sysctl_maxnipq,
 223     "I", "Maximum number of IPv4 fragment reassembly queue entries");
 224
 225 SYSCTL_UINT(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_RD | CTLFLAG_LOCKED,
 226     &nipq, 0, "Current number of IPv4 fragment reassembly queue entries");
 227
 228 SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragsperpacket,
 229     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &maxfragsperpacket, 0,
 230     sysctl_maxfragsperpacket, "I",
 231     "Maximum number of IPv4 fragments allowed per packet");
 232
 233 static uint32_t ip_adj_clear_hwcksum = 0;
 234 SYSCTL_UINT(_net_inet_ip, OID_AUTO, adj_clear_hwcksum,
 235     CTLFLAG_RW | CTLFLAG_LOCKED, &ip_adj_clear_hwcksum, 0,
 236     "Invalidate hwcksum info when adjusting length");
 237
 238 static uint32_t ip_adj_partial_sum = 1;
 239 SYSCTL_UINT(_net_inet_ip, OID_AUTO, adj_partial_sum,
 240     CTLFLAG_RW | CTLFLAG_LOCKED, &ip_adj_partial_sum, 0,
 241     "Perform partial sum adjustment of trailing bytes at IP layer");
 242
 243 /*
 244  * ip_checkinterface controls the receive side of the models for multihoming
 245  * that are discussed in RFC 1122.
 246  *
 247  * ip_checkinterface values are:
 248  *  IP_CHECKINTERFACE_WEAK_ES:
 249  *      This corresponds to the Weak End-System model where incoming packets from
 250  *      any interface are accepted provided the destination address of the incoming packet
 251  *      is assigned to some interface.
 252  *
 253  *  IP_CHECKINTERFACE_HYBRID_ES:
  254  *      The Hybrid End-System model uses the Strong End-System model for tunnel
  255  *      interfaces (ipsec and utun) and the Weak End-System model for other interface
  256  *      families.  This prevents a rogue middlebox from probing for signs of TCP
  257  *      connections that use the tunnel interface.
 258  *
 259  *  IP_CHECKINTERFACE_STRONG_ES:
  260  *      The Strong End-System model requires that the packet arrive on an
  261  *      interface that is assigned the destination address of the packet.
 262  *
 263  * Since the routing table and transmit implementation do not implement the Strong ES model,
 264  * setting this to a value different from IP_CHECKINTERFACE_WEAK_ES may lead to unexpected results.
 265  *
 266  * When forwarding is enabled, the system reverts to the Weak ES model as a router
 267  * is expected by design to receive packets from several interfaces to the same address.
 268  *
 269  * XXX - ip_checkinterface currently must be set to IP_CHECKINTERFACE_WEAK_ES if you use ipnat
 270  * to translate the destination address to another local interface.
 271  *
 272  * XXX - ip_checkinterface must be set to IP_CHECKINTERFACE_WEAK_ES if you add IP aliases
 273  * to the loopback interface instead of the interface where the
 274  * packets for those addresses are received.
 275  */
 276 #define IP_CHECKINTERFACE_WEAK_ES       0
 277 #define IP_CHECKINTERFACE_HYBRID_ES     1
 278 #define IP_CHECKINTERFACE_STRONG_ES     2
 279
 280 static int ip_checkinterface = IP_CHECKINTERFACE_HYBRID_ES;
 281
 282 static int sysctl_ip_checkinterface SYSCTL_HANDLER_ARGS;
 283 SYSCTL_PROC(_net_inet_ip, OID_AUTO, check_interface,
 284     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
 285     0, 0, sysctl_ip_checkinterface, "I", "Verify packet arrives on correct interface");
 286
 287 #if (DEBUG || DEVELOPMENT)
 288 #define IP_CHECK_IF_DEBUG 1
 289 #else
 290 #define IP_CHECK_IF_DEBUG 0
 291 #endif /* (DEBUG || DEVELOPMENT) */
 292 static int ip_checkinterface_debug = IP_CHECK_IF_DEBUG;
 293 SYSCTL_INT(_net_inet_ip, OID_AUTO, checkinterface_debug, CTLFLAG_RW | CTLFLAG_LOCKED,
 294     &ip_checkinterface_debug, IP_CHECK_IF_DEBUG, "");
 295
 296 static int ip_chaining = 1;
 297 SYSCTL_INT(_net_inet_ip, OID_AUTO, rx_chaining, CTLFLAG_RW | CTLFLAG_LOCKED,
 298     &ip_chaining, 1, "Do receive side ip address based chaining");
 299
 300 static int ip_chainsz = 6;
 301 SYSCTL_INT(_net_inet_ip, OID_AUTO, rx_chainsz, CTLFLAG_RW | CTLFLAG_LOCKED,
 302     &ip_chainsz, 1, "IP receive side max chaining");
 303
 304 #if (DEBUG || DEVELOPMENT)
 305 static int ip_input_measure = 0;
 306 SYSCTL_PROC(_net_inet_ip, OID_AUTO, input_perf,
 307     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
 308     &ip_input_measure, 0, sysctl_reset_ip_input_stats, "I", "Do time measurement");
 309
 310 static uint64_t ip_input_measure_bins = 0;
 311 SYSCTL_PROC(_net_inet_ip, OID_AUTO, input_perf_bins,
 312     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &ip_input_measure_bins, 0,
 313     sysctl_ip_input_measure_bins, "I",
 314     "bins for chaining performance data histogram");
 315
 316 static net_perf_t net_perf;
 317 SYSCTL_PROC(_net_inet_ip, OID_AUTO, input_perf_data,
 318     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
 319     0, 0, sysctl_ip_input_getperf, "S,net_perf",
 320     "IP input performance data (struct net_perf, net/net_perf.h)");
 321 #endif /* (DEBUG || DEVELOPMENT) */
 322
 323 #if DIAGNOSTIC
 324 static int ipprintfs = 0;
 325 #endif
 326
 327 struct protosw *ip_protox[IPPROTO_MAX];
 328
 329 static lck_grp_attr_t   *in_ifaddr_rwlock_grp_attr;
 330 static lck_grp_t        *in_ifaddr_rwlock_grp;
 331 static lck_attr_t       *in_ifaddr_rwlock_attr;
 332 decl_lck_rw_data(, in_ifaddr_rwlock_data);
 333 lck_rw_t                *in_ifaddr_rwlock = &in_ifaddr_rwlock_data;
 334
 335 /* Protected by in_ifaddr_rwlock */
 336 struct in_ifaddrhead in_ifaddrhead;             /* first inet address */
 337 struct in_ifaddrhashhead *in_ifaddrhashtbl;     /* inet addr hash table  */
 338
 339 #define INADDR_NHASH    61
 340 static u_int32_t inaddr_nhash;                  /* hash table size */
 341 static u_int32_t inaddr_hashp;                  /* next largest prime */
 342
 343 static int ip_getstat SYSCTL_HANDLER_ARGS;
 344 struct ipstat ipstat;
 345 SYSCTL_PROC(_net_inet_ip, IPCTL_STATS, stats,
 346     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
 347     0, 0, ip_getstat, "S,ipstat",
 348     "IP statistics (struct ipstat, netinet/ip_var.h)");
 349
 350 #if IPCTL_DEFMTU
 351 SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW | CTLFLAG_LOCKED,
 352     &ip_mtu, 0, "Default MTU");
 353 #endif /* IPCTL_DEFMTU */
 354
 355 #if IPSTEALTH
 356 static int      ipstealth = 0;
 357 SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW | CTLFLAG_LOCKED,
 358     &ipstealth, 0, "");
 359 #endif /* IPSTEALTH */
 360
 361 #if DUMMYNET
 362 ip_dn_io_t *ip_dn_io_ptr;
 363 #endif /* DUMMYNET */
 364
 365 SYSCTL_NODE(_net_inet_ip, OID_AUTO, linklocal,
 366     CTLFLAG_RW | CTLFLAG_LOCKED, 0, "link local");
 367
 368 struct ip_linklocal_stat ip_linklocal_stat;
 369 SYSCTL_STRUCT(_net_inet_ip_linklocal, OID_AUTO, stat,
 370     CTLFLAG_RD | CTLFLAG_LOCKED, &ip_linklocal_stat, ip_linklocal_stat,
 371     "Number of link local packets with TTL less than 255");
 372
 373 SYSCTL_NODE(_net_inet_ip_linklocal, OID_AUTO, in,
 374     CTLFLAG_RW | CTLFLAG_LOCKED, 0, "link local input");
 375
 376 int ip_linklocal_in_allowbadttl = 1;
 377 SYSCTL_INT(_net_inet_ip_linklocal_in, OID_AUTO, allowbadttl,
 378     CTLFLAG_RW | CTLFLAG_LOCKED, &ip_linklocal_in_allowbadttl, 0,
 379     "Allow incoming link local packets with TTL less than 255");
 380
 381
 382 /*
 383  * We need to save the IP options in case a protocol wants to respond
 384  * to an incoming packet over the same route if the packet got here
 385  * using IP source routing.  This allows connection establishment and
 386  * maintenance when the remote end is on a network that is not known
 387  * to us.
 388  */
 389 static int      ip_nhops = 0;
 390 static  struct ip_srcrt {
 391         struct  in_addr dst;                    /* final destination */
 392         char    nop;                            /* one NOP to align */
 393         char    srcopt[IPOPT_OFFSET + 1];       /* OPTVAL, OLEN and OFFSET */
 394         struct  in_addr route[MAX_IPOPTLEN / sizeof(struct in_addr)];
 395 } ip_srcrt;
 396
 397 static void in_ifaddrhashtbl_init(void);
 398 static void save_rte(u_char *, struct in_addr);
 399 static int ip_dooptions(struct mbuf *, int, struct sockaddr_in *);
 400 static void ip_forward(struct mbuf *, int, struct sockaddr_in *);
 401 static void frag_freef(struct ipqhead *, struct ipq *);
 402 static struct mbuf *ip_reass(struct mbuf *);
 403 static void ip_fwd_route_copyout(struct ifnet *, struct route *);
 404 static void ip_fwd_route_copyin(struct ifnet *, struct route *);
 405 static inline u_short ip_cksum(struct mbuf *, int);
 406
 407 int ip_use_randomid = 1;
 408 SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW | CTLFLAG_LOCKED,
 409     &ip_use_randomid, 0, "Randomize IP packets IDs");
 410
 411 /*
 412  * On platforms which require strict alignment (currently for anything but
 413  * i386 or x86_64), check if the IP header pointer is 32-bit aligned; if not,
 414  * copy the contents of the mbuf chain into a new chain, and free the original
 415  * one.  Create some head room in the first mbuf of the new chain, in case
 416  * it's needed later on.
 417  */
 418 #if defined(__i386__) || defined(__x86_64__)
 419 #define IP_HDR_ALIGNMENT_FIXUP(_m, _ifp, _action) do { } while (0)
 420 #else /* !__i386__ && !__x86_64__ */
 421 #define IP_HDR_ALIGNMENT_FIXUP(_m, _ifp, _action) do {                  \
 422         if (!IP_HDR_ALIGNED_P(mtod(_m, caddr_t))) {                     \
 423                 struct mbuf *_n;                                        \
 424                 struct ifnet *__ifp = (_ifp);                           \
 425                 atomic_add_64(&(__ifp)->if_alignerrs, 1);               \
 426                 if (((_m)->m_flags & M_PKTHDR) &&                       \
 427                     (_m)->m_pkthdr.pkt_hdr != NULL)                     \
 428                         (_m)->m_pkthdr.pkt_hdr = NULL;                  \
 429                 _n = m_defrag_offset(_m, max_linkhdr, M_NOWAIT);        \
 430                 if (_n == NULL) {                                       \
 431                         atomic_add_32(&ipstat.ips_toosmall, 1);         \
 432                         m_freem(_m);                                    \
 433                         (_m) = NULL;                                    \
 434                         _action;                                        \
 435                 } else {                                                \
 436                         VERIFY(_n != (_m));                             \
 437                         (_m) = _n;                                      \
 438                 }                                                       \
 439         }                                                               \
 440 } while (0)
 441 #endif /* !__i386__ && !__x86_64__ */
 442
 443
 444 typedef enum ip_check_if_result {
 445         IP_CHECK_IF_NONE = 0,
 446         IP_CHECK_IF_OURS = 1,
 447         IP_CHECK_IF_DROP = 2,
 448         IP_CHECK_IF_FORWARD = 3
 449 } ip_check_if_result_t;
 450
 451 static ip_check_if_result_t ip_input_check_interface(struct mbuf **, struct ip *, struct ifnet *);
 452
 453 /*
 454  * GRE input handler function, settable via ip_gre_register_input() for PPTP.
 455  */
 456 static gre_input_func_t gre_input_func;
 457
 458 static void
 459 ip_init_delayed(void)
 460 {
 461         struct ifreq ifr;
 462         int error;
 463         struct sockaddr_in *sin;
 464
 465         bzero(&ifr, sizeof(ifr));
 466         strlcpy(ifr.ifr_name, "lo0", sizeof(ifr.ifr_name));
 467         sin = (struct sockaddr_in *)(void *)&ifr.ifr_addr;
 468         sin->sin_len = sizeof(struct sockaddr_in);
 469         sin->sin_family = AF_INET;
 470         sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
 471         error = in_control(NULL, SIOCSIFADDR, (caddr_t)&ifr, lo_ifp, kernproc);
 472         if (error) {
 473                 printf("%s: failed to initialise lo0's address, error=%d\n",
 474                     __func__, error);
 475         }
 476 }
 477
 478 /*
 479  * IP initialization: fill in IP protocol switch table.
 480  * All protocols not implemented in kernel go to raw IP protocol handler.
 481  */
 482 void
 483 ip_init(struct protosw *pp, struct domain *dp)
 484 {
 485         static int ip_initialized = 0;
 486         struct protosw *pr;
 487         struct timeval tv;
 488         int i;
 489
 490         domain_proto_mtx_lock_assert_held();
 491         VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED);
 492
 493         /* ipq_alloc() uses mbufs for IP fragment queue structures */
 494         _CASSERT(sizeof(struct ipq) <= _MLEN);
 495
 496         /*
 497          * Some ioctls (e.g. SIOCAIFADDR) use ifaliasreq struct, which is
 498          * interchangeable with in_aliasreq; they must have the same size.
 499          */
 500         _CASSERT(sizeof(struct ifaliasreq) == sizeof(struct in_aliasreq));
 501
 502         if (ip_initialized) {
 503                 return;
 504         }
 505         ip_initialized = 1;
 506
 507         in_ifaddr_init();
 508
 509         in_ifaddr_rwlock_grp_attr = lck_grp_attr_alloc_init();
 510         in_ifaddr_rwlock_grp = lck_grp_alloc_init("in_ifaddr_rwlock",
 511             in_ifaddr_rwlock_grp_attr);
 512         in_ifaddr_rwlock_attr = lck_attr_alloc_init();
 513         lck_rw_init(in_ifaddr_rwlock, in_ifaddr_rwlock_grp,
 514             in_ifaddr_rwlock_attr);
 515
 516         TAILQ_INIT(&in_ifaddrhead);
 517         in_ifaddrhashtbl_init();
 518
 519         ip_moptions_init();
 520
 521         pr = pffindproto_locked(PF_INET, IPPROTO_RAW, SOCK_RAW);
 522         if (pr == NULL) {
 523                 panic("%s: Unable to find [PF_INET,IPPROTO_RAW,SOCK_RAW]\n",
 524                     __func__);
 525                 /* NOTREACHED */
 526         }
 527
 528         /* Initialize the entire ip_protox[] array to IPPROTO_RAW. */
 529         for (i = 0; i < IPPROTO_MAX; i++) {
 530                 ip_protox[i] = pr;
 531         }
 532         /*
 533          * Cycle through IP protocols and put them into the appropriate place
 534          * in ip_protox[], skipping protocols IPPROTO_{IP,RAW}.
 535          */
 536         VERIFY(dp == inetdomain && dp->dom_family == PF_INET);
 537         TAILQ_FOREACH(pr, &dp->dom_protosw, pr_entry) {
 538                 VERIFY(pr->pr_domain == dp);
 539                 if (pr->pr_protocol != 0 && pr->pr_protocol != IPPROTO_RAW) {
 540                         /* Be careful to only index valid IP protocols. */
 541                         if (pr->pr_protocol < IPPROTO_MAX) {
 542                                 ip_protox[pr->pr_protocol] = pr;
 543                         }
 544                 }
 545         }
 546
 547         /* IP fragment reassembly queue lock */
 548         ipqlock_grp_attr  = lck_grp_attr_alloc_init();
 549         ipqlock_grp = lck_grp_alloc_init("ipqlock", ipqlock_grp_attr);
 550         ipqlock_attr = lck_attr_alloc_init();
 551         lck_mtx_init(&ipqlock, ipqlock_grp, ipqlock_attr);
 552
 553         lck_mtx_lock(&ipqlock);
 554         /* Initialize IP reassembly queue. */
 555         for (i = 0; i < IPREASS_NHASH; i++) {
 556                 TAILQ_INIT(&ipq[i]);
 557         }
 558
 559         maxnipq = nmbclusters / 32;
 560         maxfragsperpacket = 128; /* enough for 64k in 512 byte fragments */
 561         ipq_updateparams();
 562         lck_mtx_unlock(&ipqlock);
 563
 564         getmicrotime(&tv);
 565         ip_id = RandomULong() ^ tv.tv_usec;
 566         ip_initid();
 567
 568         ipf_init();
 569
 570         PE_parse_boot_argn("ip_checkinterface", &i, sizeof(i));
 571         switch (i) {
 572         case IP_CHECKINTERFACE_WEAK_ES:
 573         case IP_CHECKINTERFACE_HYBRID_ES:
 574         case IP_CHECKINTERFACE_STRONG_ES:
 575                 ip_checkinterface = i;
 576                 break;
 577         default:
 578                 break;
 579         }
 580
 581 #if IPSEC
 582         sadb_stat_mutex_grp_attr = lck_grp_attr_alloc_init();
 583         sadb_stat_mutex_grp = lck_grp_alloc_init("sadb_stat",
 584             sadb_stat_mutex_grp_attr);
 585         sadb_stat_mutex_attr = lck_attr_alloc_init();
 586         lck_mtx_init(sadb_stat_mutex, sadb_stat_mutex_grp,
 587             sadb_stat_mutex_attr);
 588
 589 #endif
 590         arp_init();
 591         net_init_add(ip_init_delayed);
 592 }
 593
 594 /*
 595  * Initialize IPv4 source address hash table.
 596  */
 597 static void
 598 in_ifaddrhashtbl_init(void)
 599 {
 600         int i, k, p;
 601
 602         if (in_ifaddrhashtbl != NULL) {
 603                 return;
 604         }
 605
 606         PE_parse_boot_argn("inaddr_nhash", &inaddr_nhash,
 607             sizeof(inaddr_nhash));
 608         if (inaddr_nhash == 0) {
 609                 inaddr_nhash = INADDR_NHASH;
 610         }
 611
 612         MALLOC(in_ifaddrhashtbl, struct in_ifaddrhashhead *,
 613             inaddr_nhash * sizeof(*in_ifaddrhashtbl),
 614             M_IFADDR, M_WAITOK | M_ZERO);
 615         if (in_ifaddrhashtbl == NULL) {
 616                 panic("in_ifaddrhashtbl_init allocation failed");
 617         }
 618
 619         /*
 620          * Generate the next largest prime greater than inaddr_nhash.
 621          */
 622         k = (inaddr_nhash % 2 == 0) ? inaddr_nhash + 1 : inaddr_nhash + 2;
 623         for (;;) {
 624                 p = 1;
 625                 for (i = 3; i * i <= k; i += 2) {
 626                         if (k % i == 0) {
 627                                 p = 0;
 628                         }
 629                 }
 630                 if (p == 1) {
 631                         break;
 632                 }
 633                 k += 2;
 634         }
 635         inaddr_hashp = k;
 636 }
 637
 638 u_int32_t
 639 inaddr_hashval(u_int32_t key)
 640 {
 641         /*
 642          * The hash index is the computed prime times the key modulo
 643          * the hash size, as documented in "Introduction to Algorithms"
 644          * (Cormen, Leiserson, Rivest).
 645          */
 646         if (inaddr_nhash > 1) {
 647                 return (key * inaddr_hashp) % inaddr_nhash;
 648         } else {
 649                 return 0;
 650         }
 651 }
 652
 653 __private_extern__ void
 654 ip_proto_dispatch_in(struct mbuf *m, int hlen, u_int8_t proto,
 655     ipfilter_t inject_ipfref)
 656 {
 657         struct ipfilter *filter;
 658         int seen = (inject_ipfref == NULL);
 659         int     changed_header = 0;
 660         struct ip *ip;
 661         void (*pr_input)(struct mbuf *, int len);
 662
 663         if (!TAILQ_EMPTY(&ipv4_filters)) {
 664                 ipf_ref();
 665                 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
 666                         if (seen == 0) {
 667                                 if ((struct ipfilter *)inject_ipfref == filter) {
 668                                         seen = 1;
 669                                 }
 670                         } else if (filter->ipf_filter.ipf_input) {
 671                                 errno_t result;
 672
 673                                 if (changed_header == 0) {
 674                                         /*
 675                                          * Perform IP header alignment fixup,
 676                                          * if needed, before passing packet
 677                                          * into filter(s).
 678                                          */
 679                                         IP_HDR_ALIGNMENT_FIXUP(m,
 680                                             m->m_pkthdr.rcvif, ipf_unref());
 681
 682                                         /* ipf_unref() already called */
 683                                         if (m == NULL) {
 684                                                 return;
 685                                         }
 686
 687                                         changed_header = 1;
 688                                         ip = mtod(m, struct ip *);
 689                                         ip->ip_len = htons(ip->ip_len + hlen);
 690                                         ip->ip_off = htons(ip->ip_off);
 691                                         ip->ip_sum = 0;
 692                                         ip->ip_sum = ip_cksum_hdr_in(m, hlen);
 693                                 }
 694                                 result = filter->ipf_filter.ipf_input(
 695                                         filter->ipf_filter.cookie, (mbuf_t *)&m,
 696                                         hlen, proto);
 697                                 if (result == EJUSTRETURN) {
 698                                         ipf_unref();
 699                                         return;
 700                                 }
 701                                 if (result != 0) {
 702                                         ipf_unref();
 703                                         m_freem(m);
 704                                         return;
 705                                 }
 706                         }
 707                 }
 708                 ipf_unref();
 709         }
 710
 711         /* Perform IP header alignment fixup (post-filters), if needed */
 712         IP_HDR_ALIGNMENT_FIXUP(m, m->m_pkthdr.rcvif, return );
 713
 714         ip = mtod(m, struct ip *);
 715
 716         if (changed_header) {
 717                 ip->ip_len = ntohs(ip->ip_len) - hlen;
 718                 ip->ip_off = ntohs(ip->ip_off);
 719         }
 720
 721         /*
 722          * If there isn't a specific lock for the protocol
 723          * we're about to call, use the generic lock for AF_INET.
 724          * otherwise let the protocol deal with its own locking
 725          */
 726         if ((pr_input = ip_protox[ip->ip_p]->pr_input) == NULL) {
 727                 m_freem(m);
 728         } else if (!(ip_protox[ip->ip_p]->pr_flags & PR_PROTOLOCK)) {
 729                 lck_mtx_lock(inet_domain_mutex);
 730                 pr_input(m, hlen);
 731                 lck_mtx_unlock(inet_domain_mutex);
 732         } else {
 733                 pr_input(m, hlen);
 734         }
 735 }
 736
/*
 * One slot of the flow table used while chaining inbound packets.  A slot
 * holds the packets of a single flow (same source address, destination
 * address and IP protocol), linked together as a packet chain.
 */
struct pktchain_elm {
        struct mbuf     *pkte_head;     /* first packet of the chain; NULL when the slot is unused */
        struct mbuf     *pkte_tail;     /* last packet appended to the chain */
        struct in_addr  pkte_saddr;     /* source address of the flow */
        struct in_addr  pkte_daddr;     /* destination address of the flow */
        uint16_t        pkte_npkts;     /* number of packets in the chain */
        uint16_t        pkte_proto;     /* IP protocol (ip_p) of the flow */
        uint32_t        pkte_nbytes;    /* sum of m_pkthdr.len over the chain */
};

typedef struct pktchain_elm pktchain_elm_t;

/* Store up to PKTTBL_SZ unique flows on the stack */
#define PKTTBL_SZ       7
 751
 752 static struct mbuf *
 753 ip_chain_insert(struct mbuf *packet, pktchain_elm_t *tbl)
 754 {
 755         struct ip*      ip;
 756         int             pkttbl_idx = 0;
 757
 758         ip = mtod(packet, struct ip*);
 759
 760         /* reusing the hash function from inaddr_hashval */
 761         pkttbl_idx = inaddr_hashval(ntohs(ip->ip_src.s_addr)) % PKTTBL_SZ;
 762         if (tbl[pkttbl_idx].pkte_head == NULL) {
 763                 tbl[pkttbl_idx].pkte_head = packet;
 764                 tbl[pkttbl_idx].pkte_saddr.s_addr = ip->ip_src.s_addr;
 765                 tbl[pkttbl_idx].pkte_daddr.s_addr = ip->ip_dst.s_addr;
 766                 tbl[pkttbl_idx].pkte_proto = ip->ip_p;
 767         } else {
 768                 if ((ip->ip_dst.s_addr == tbl[pkttbl_idx].pkte_daddr.s_addr) &&
 769                     (ip->ip_src.s_addr == tbl[pkttbl_idx].pkte_saddr.s_addr) &&
 770                     (ip->ip_p == tbl[pkttbl_idx].pkte_proto)) {
 771                 } else {
 772                         return packet;
 773                 }
 774         }
 775         if (tbl[pkttbl_idx].pkte_tail != NULL) {
 776                 mbuf_setnextpkt(tbl[pkttbl_idx].pkte_tail, packet);
 777         }
 778
 779         tbl[pkttbl_idx].pkte_tail = packet;
 780         tbl[pkttbl_idx].pkte_npkts += 1;
 781         tbl[pkttbl_idx].pkte_nbytes += packet->m_pkthdr.len;
 782         return NULL;
 783 }
 784
 785 /* args is a dummy variable here for backward compatibility */
 786 static void
 787 ip_input_second_pass_loop_tbl(pktchain_elm_t *tbl, struct ip_fw_in_args *args)
 788 {
 789         int i = 0;
 790
 791         for (i = 0; i < PKTTBL_SZ; i++) {
 792                 if (tbl[i].pkte_head != NULL) {
 793                         struct mbuf *m = tbl[i].pkte_head;
 794                         ip_input_second_pass(m, m->m_pkthdr.rcvif,
 795                             tbl[i].pkte_npkts, tbl[i].pkte_nbytes, args);
 796
 797                         if (tbl[i].pkte_npkts > 2) {
 798                                 ipstat.ips_rxc_chainsz_gt2++;
 799                         }
 800                         if (tbl[i].pkte_npkts > 4) {
 801                                 ipstat.ips_rxc_chainsz_gt4++;
 802                         }
 803 #if (DEBUG || DEVELOPMENT)
 804                         if (ip_input_measure) {
 805                                 net_perf_histogram(&net_perf, tbl[i].pkte_npkts);
 806                         }
 807 #endif /* (DEBUG || DEVELOPMENT) */
 808                         tbl[i].pkte_head = tbl[i].pkte_tail = NULL;
 809                         tbl[i].pkte_npkts = 0;
 810                         tbl[i].pkte_nbytes = 0;
 811                         /* no need to initialize address and protocol in tbl */
 812                 }
 813         }
 814 }
 815
 816 static void
 817 ip_input_cpout_args(struct ip_fw_in_args *args, struct ip_fw_args *args1,
 818     boolean_t *done_init)
 819 {
 820         if (*done_init == FALSE) {
 821                 bzero(args1, sizeof(struct ip_fw_args));
 822                 *done_init = TRUE;
 823         }
 824         args1->fwa_pf_rule = args->fwai_pf_rule;
 825 }
 826
/*
 * Copy the pf rule from the firewall argument block `args1' back into the
 * inbound argument block `args' (the reverse of ip_input_cpout_args()).
 */
static void
ip_input_cpin_args(struct ip_fw_args *args1, struct ip_fw_in_args *args)
{
        args->fwai_pf_rule = args1->fwa_pf_rule;
}
 832
/* Outcome of ip_input_first_pass() for a single packet */
typedef enum {
        IPINPUT_DOCHAIN = 0,    /* packet passed validation and may be chained */
        IPINPUT_DONTCHAIN,      /* packet passed validation but must not be chained */
        IPINPUT_FREED,          /* packet was dropped; the mbuf is gone */
        IPINPUT_DONE            /* packet was already dispatched to the protocol */
} ipinput_chain_ret_t;
 839
 840 static void
 841 ip_input_update_nstat(struct ifnet *ifp, struct in_addr src_ip,
 842     u_int32_t packets, u_int32_t bytes)
 843 {
 844         if (nstat_collect) {
 845                 struct rtentry *rt = ifnet_cached_rtlookup_inet(ifp,
 846                     src_ip);
 847                 if (rt != NULL) {
 848                         nstat_route_rx(rt, packets, bytes, 0);
 849                         rtfree(rt);
 850                 }
 851         }
 852 }
 853
 854 static void
 855 ip_input_dispatch_chain(struct mbuf *m)
 856 {
 857         struct mbuf *tmp_mbuf = m;
 858         struct mbuf *nxt_mbuf = NULL;
 859         struct ip *ip = NULL;
 860         unsigned int hlen;
 861
 862         ip = mtod(tmp_mbuf, struct ip *);
 863         hlen = IP_VHL_HL(ip->ip_vhl) << 2;
 864         while (tmp_mbuf != NULL) {
 865                 nxt_mbuf = mbuf_nextpkt(tmp_mbuf);
 866                 mbuf_setnextpkt(tmp_mbuf, NULL);
 867                 ip_proto_dispatch_in(tmp_mbuf, hlen, ip->ip_p, 0);
 868                 tmp_mbuf = nxt_mbuf;
 869                 if (tmp_mbuf) {
 870                         ip = mtod(tmp_mbuf, struct ip *);
 871                         /* first mbuf of chain already has adjusted ip_len */
 872                         hlen = IP_VHL_HL(ip->ip_vhl) << 2;
 873                         ip->ip_len -= hlen;
 874                 }
 875         }
 876 }
 877
 878 static void
 879 ip_input_setdst_chain(struct mbuf *m, uint32_t ifindex, struct in_ifaddr *ia)
 880 {
 881         struct mbuf *tmp_mbuf = m;
 882
 883         while (tmp_mbuf != NULL) {
 884                 ip_setdstifaddr_info(tmp_mbuf, ifindex, ia);
 885                 tmp_mbuf = mbuf_nextpkt(tmp_mbuf);
 886         }
 887 }
 888
 889 static void
 890 ip_input_adjust(struct mbuf *m, struct ip *ip, struct ifnet *inifp)
 891 {
 892         boolean_t adjust = TRUE;
 893
 894         ASSERT(m_pktlen(m) > ip->ip_len);
 895
 896         /*
 897          * Invalidate hardware checksum info if ip_adj_clear_hwcksum
 898          * is set; useful to handle buggy drivers.  Note that this
 899          * should not be enabled by default, as we may get here due
 900          * to link-layer padding.
 901          */
 902         if (ip_adj_clear_hwcksum &&
 903             (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) &&
 904             !(inifp->if_flags & IFF_LOOPBACK) &&
 905             !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
 906                 m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
 907                 m->m_pkthdr.csum_data = 0;
 908                 ipstat.ips_adj_hwcsum_clr++;
 909         }
 910
 911         /*
 912          * If partial checksum information is available, subtract
 913          * out the partial sum of postpended extraneous bytes, and
 914          * update the checksum metadata accordingly.  By doing it
 915          * here, the upper layer transport only needs to adjust any
 916          * prepended extraneous bytes (else it will do both.)
 917          */
 918         if (ip_adj_partial_sum &&
 919             (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
 920             (CSUM_DATA_VALID | CSUM_PARTIAL)) {
 921                 m->m_pkthdr.csum_rx_val = m_adj_sum16(m,
 922                     m->m_pkthdr.csum_rx_start, m->m_pkthdr.csum_rx_start,
 923                     (ip->ip_len - m->m_pkthdr.csum_rx_start),
 924                     m->m_pkthdr.csum_rx_val);
 925         } else if ((m->m_pkthdr.csum_flags &
 926             (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
 927             (CSUM_DATA_VALID | CSUM_PARTIAL)) {
 928                 /*
 929                  * If packet has partial checksum info and we decided not
 930                  * to subtract the partial sum of postpended extraneous
 931                  * bytes here (not the default case), leave that work to
 932                  * be handled by the other layers.  For now, only TCP, UDP
 933                  * layers are capable of dealing with this.  For all other
 934                  * protocols (including fragments), trim and ditch the
 935                  * partial sum as those layers might not implement partial
 936                  * checksumming (or adjustment) at all.
 937                  */
 938                 if ((ip->ip_off & (IP_MF | IP_OFFMASK)) == 0 &&
 939                     (ip->ip_p == IPPROTO_TCP || ip->ip_p == IPPROTO_UDP)) {
 940                         adjust = FALSE;
 941                 } else {
 942                         m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
 943                         m->m_pkthdr.csum_data = 0;
 944                         ipstat.ips_adj_hwcsum_clr++;
 945                 }
 946         }
 947
 948         if (adjust) {
 949                 ipstat.ips_adj++;
 950                 if (m->m_len == m->m_pkthdr.len) {
 951                         m->m_len = ip->ip_len;
 952                         m->m_pkthdr.len = ip->ip_len;
 953                 } else {
 954                         m_adj(m, ip->ip_len - m->m_pkthdr.len);
 955                 }
 956         }
 957 }
 958
 959 /*
 960  * First pass does all essential packet validation and places on a per flow
 961  * queue for doing operations that have same outcome for all packets of a flow.
 962  */
 963 static ipinput_chain_ret_t
 964 ip_input_first_pass(struct mbuf *m, struct ip_fw_in_args *args, struct mbuf **modm)
 965 {
 966         struct ip       *ip;
 967         struct ifnet    *inifp;
 968         unsigned int    hlen;
 969         int             retval = IPINPUT_DOCHAIN;
 970         int             len = 0;
 971         struct in_addr  src_ip;
 972 #if DUMMYNET
 973         struct m_tag            *copy;
 974         struct m_tag            *p;
 975         boolean_t               delete = FALSE;
 976         struct ip_fw_args       args1;
 977         boolean_t               init = FALSE;
 978 #endif /* DUMMYNET */
 979         ipfilter_t inject_filter_ref = NULL;
 980
 981         /* Check if the mbuf is still valid after interface filter processing */
 982         MBUF_INPUT_CHECK(m, m->m_pkthdr.rcvif);
 983         inifp = mbuf_pkthdr_rcvif(m);
 984         VERIFY(inifp != NULL);
 985
 986         /* Perform IP header alignment fixup, if needed */
 987         IP_HDR_ALIGNMENT_FIXUP(m, inifp, goto bad);
 988
 989         m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
 990
 991 #if DUMMYNET
 992         /*
 993          * Don't bother searching for tag(s) if there's none.
 994          */
 995         if (SLIST_EMPTY(&m->m_pkthdr.tags)) {
 996                 goto ipfw_tags_done;
 997         }
 998
 999         /* Grab info from mtags prepended to the chain */
1000         p = m_tag_first(m);
1001         while (p) {
1002                 if (p->m_tag_id == KERNEL_MODULE_TAG_ID) {
1003                         if (p->m_tag_type == KERNEL_TAG_TYPE_DUMMYNET) {
1004                                 struct dn_pkt_tag *dn_tag;
1005
1006                                 dn_tag = (struct dn_pkt_tag *)(p + 1);
1007                                 args->fwai_pf_rule = dn_tag->dn_pf_rule;
1008                                 delete = TRUE;
1009                         }
1010
1011                         if (delete) {
1012                                 copy = p;
1013                                 p = m_tag_next(m, p);
1014                                 m_tag_delete(m, copy);
1015                         } else {
1016                                 p = m_tag_next(m, p);
1017                         }
1018                 } else {
1019                         p = m_tag_next(m, p);
1020                 }
1021         }
1022
1023 #if DIAGNOSTIC
1024         if (m == NULL || !(m->m_flags & M_PKTHDR)) {
1025                 panic("ip_input no HDR");
1026         }
1027 #endif
1028
1029         if (args->fwai_pf_rule) {
1030                 /* dummynet already filtered us */
1031                 ip = mtod(m, struct ip *);
1032                 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1033                 inject_filter_ref = ipf_get_inject_filter(m);
1034                 if (args->fwai_pf_rule) {
1035                         goto check_with_pf;
1036                 }
1037         }
1038 ipfw_tags_done:
1039 #endif /* DUMMYNET */
1040
1041         /*
1042          * No need to process packet twice if we've already seen it.
1043          */
1044         if (!SLIST_EMPTY(&m->m_pkthdr.tags)) {
1045                 inject_filter_ref = ipf_get_inject_filter(m);
1046         }
1047         if (inject_filter_ref != NULL) {
1048                 ip = mtod(m, struct ip *);
1049                 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1050
1051                 DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL,
1052                     struct ip *, ip, struct ifnet *, inifp,
1053                     struct ip *, ip, struct ip6_hdr *, NULL);
1054
1055                 ip->ip_len = ntohs(ip->ip_len) - hlen;
1056                 ip->ip_off = ntohs(ip->ip_off);
1057                 ip_proto_dispatch_in(m, hlen, ip->ip_p, inject_filter_ref);
1058                 return IPINPUT_DONE;
1059         }
1060
1061         if (m->m_pkthdr.len < sizeof(struct ip)) {
1062                 OSAddAtomic(1, &ipstat.ips_total);
1063                 OSAddAtomic(1, &ipstat.ips_tooshort);
1064                 m_freem(m);
1065                 return IPINPUT_FREED;
1066         }
1067
1068         if (m->m_len < sizeof(struct ip) &&
1069             (m = m_pullup(m, sizeof(struct ip))) == NULL) {
1070                 OSAddAtomic(1, &ipstat.ips_total);
1071                 OSAddAtomic(1, &ipstat.ips_toosmall);
1072                 return IPINPUT_FREED;
1073         }
1074
1075         ip = mtod(m, struct ip *);
1076         *modm = m;
1077
1078         KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr, ip->ip_src.s_addr,
1079             ip->ip_p, ip->ip_off, ip->ip_len);
1080
1081         if (IP_VHL_V(ip->ip_vhl) != IPVERSION) {
1082                 OSAddAtomic(1, &ipstat.ips_total);
1083                 OSAddAtomic(1, &ipstat.ips_badvers);
1084                 KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1085                 m_freem(m);
1086                 return IPINPUT_FREED;
1087         }
1088
1089         hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1090         if (hlen < sizeof(struct ip)) {
1091                 OSAddAtomic(1, &ipstat.ips_total);
1092                 OSAddAtomic(1, &ipstat.ips_badhlen);
1093                 KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1094                 m_freem(m);
1095                 return IPINPUT_FREED;
1096         }
1097
1098         if (hlen > m->m_len) {
1099                 if ((m = m_pullup(m, hlen)) == NULL) {
1100                         OSAddAtomic(1, &ipstat.ips_total);
1101                         OSAddAtomic(1, &ipstat.ips_badhlen);
1102                         KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1103                         return IPINPUT_FREED;
1104                 }
1105                 ip = mtod(m, struct ip *);
1106                 *modm = m;
1107         }
1108
1109         /* 127/8 must not appear on wire - RFC1122 */
1110         if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
1111             (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
1112                 /*
1113                  * Allow for the following exceptions:
1114                  *
1115                  *   1. If the packet was sent to loopback (i.e. rcvif
1116                  *      would have been set earlier at output time.)
1117                  *
1118                  *   2. If the packet was sent out on loopback from a local
1119                  *      source address which belongs to a non-loopback
1120                  *      interface (i.e. rcvif may not necessarily be a
1121                  *      loopback interface, hence the test for PKTF_LOOP.)
1122                  *      Unlike IPv6, there is no interface scope ID, and
1123                  *      therefore we don't care so much about PKTF_IFINFO.
1124                  */
1125                 if (!(inifp->if_flags & IFF_LOOPBACK) &&
1126                     !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
1127                         OSAddAtomic(1, &ipstat.ips_total);
1128                         OSAddAtomic(1, &ipstat.ips_badaddr);
1129                         KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1130                         m_freem(m);
1131                         return IPINPUT_FREED;
1132                 }
1133         }
1134
1135         /* IPv4 Link-Local Addresses as defined in RFC3927 */
1136         if ((IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) ||
1137             IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)))) {
1138                 ip_linklocal_stat.iplls_in_total++;
1139                 if (ip->ip_ttl != MAXTTL) {
1140                         OSAddAtomic(1, &ip_linklocal_stat.iplls_in_badttl);
1141                         /* Silently drop link local traffic with bad TTL */
1142                         if (!ip_linklocal_in_allowbadttl) {
1143                                 OSAddAtomic(1, &ipstat.ips_total);
1144                                 KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1145                                 m_freem(m);
1146                                 return IPINPUT_FREED;
1147                         }
1148                 }
1149         }
1150
1151         if (ip_cksum(m, hlen)) {
1152                 OSAddAtomic(1, &ipstat.ips_total);
1153                 KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1154                 m_freem(m);
1155                 return IPINPUT_FREED;
1156         }
1157
1158         DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL,
1159             struct ip *, ip, struct ifnet *, inifp,
1160             struct ip *, ip, struct ip6_hdr *, NULL);
1161
1162         /*
1163          * Convert fields to host representation.
1164          */
1165 #if BYTE_ORDER != BIG_ENDIAN
1166         NTOHS(ip->ip_len);
1167 #endif
1168
1169         if (ip->ip_len < hlen) {
1170                 OSAddAtomic(1, &ipstat.ips_total);
1171                 OSAddAtomic(1, &ipstat.ips_badlen);
1172                 KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1173                 m_freem(m);
1174                 return IPINPUT_FREED;
1175         }
1176
1177 #if BYTE_ORDER != BIG_ENDIAN
1178         NTOHS(ip->ip_off);
1179 #endif
1180
1181         /*
1182          * Check that the amount of data in the buffers
1183          * is as at least much as the IP header would have us expect.
1184          * Trim mbufs if longer than we expect.
1185          * Drop packet if shorter than we expect.
1186          */
1187         if (m->m_pkthdr.len < ip->ip_len) {
1188                 OSAddAtomic(1, &ipstat.ips_total);
1189                 OSAddAtomic(1, &ipstat.ips_tooshort);
1190                 KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1191                 m_freem(m);
1192                 return IPINPUT_FREED;
1193         }
1194
1195         if (m->m_pkthdr.len > ip->ip_len) {
1196                 ip_input_adjust(m, ip, inifp);
1197         }
1198
1199         /* for netstat route statistics */
1200         src_ip = ip->ip_src;
1201         len = m->m_pkthdr.len;
1202
1203 #if DUMMYNET
1204 check_with_pf:
1205 #endif /* DUMMYNET */
1206 #if PF
1207         /* Invoke inbound packet filter */
1208         if (PF_IS_ENABLED) {
1209                 int error;
1210                 ip_input_cpout_args(args, &args1, &init);
1211                 ip = mtod(m, struct ip *);
1212                 src_ip = ip->ip_src;
1213
1214 #if DUMMYNET
1215                 error = pf_af_hook(inifp, NULL, &m, AF_INET, TRUE, &args1);
1216 #else
1217                 error = pf_af_hook(inifp, NULL, &m, AF_INET, TRUE, NULL);
1218 #endif /* DUMMYNET */
1219                 if (error != 0 || m == NULL) {
1220                         if (m != NULL) {
1221                                 panic("%s: unexpected packet %p\n",
1222                                     __func__, m);
1223                                 /* NOTREACHED */
1224                         }
1225                         /* Already freed by callee */
1226                         ip_input_update_nstat(inifp, src_ip, 1, len);
1227                         KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1228                         OSAddAtomic(1, &ipstat.ips_total);
1229                         return IPINPUT_FREED;
1230                 }
1231                 ip = mtod(m, struct ip *);
1232                 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1233                 *modm = m;
1234                 ip_input_cpin_args(&args1, args);
1235         }
1236 #endif /* PF */
1237
1238 #if IPSEC
1239         if (ipsec_bypass == 0 && ipsec_gethist(m, NULL)) {
1240                 retval = IPINPUT_DONTCHAIN; /* XXX scope for chaining here? */
1241                 goto pass;
1242         }
1243 #endif
1244
1245 #if IPSEC
1246 pass:
1247 #endif
1248         /*
1249          * Process options and, if not destined for us,
1250          * ship it on.  ip_dooptions returns 1 when an
1251          * error was detected (causing an icmp message
1252          * to be sent and the original packet to be freed).
1253          */
1254         ip_nhops = 0;           /* for source routed packets */
1255         if (hlen > sizeof(struct ip) && ip_dooptions(m, 0, NULL)) {
1256                 ip_input_update_nstat(inifp, src_ip, 1, len);
1257                 KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1258                 OSAddAtomic(1, &ipstat.ips_total);
1259                 return IPINPUT_FREED;
1260         }
1261
1262         /*
1263          * Don't chain fragmented packets
1264          */
1265         if (ip->ip_off & ~(IP_DF | IP_RF)) {
1266                 return IPINPUT_DONTCHAIN;
1267         }
1268
1269         /* Allow DHCP/BootP responses through */
1270         if ((inifp->if_eflags & IFEF_AUTOCONFIGURING) &&
1271             hlen == sizeof(struct ip) && ip->ip_p == IPPROTO_UDP) {
1272                 struct udpiphdr *ui;
1273
1274                 if (m->m_len < sizeof(struct udpiphdr) &&
1275                     (m = m_pullup(m, sizeof(struct udpiphdr))) == NULL) {
1276                         OSAddAtomic(1, &udpstat.udps_hdrops);
1277                         KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1278                         OSAddAtomic(1, &ipstat.ips_total);
1279                         return IPINPUT_FREED;
1280                 }
1281                 *modm = m;
1282                 ui = mtod(m, struct udpiphdr *);
1283                 if (ntohs(ui->ui_dport) == IPPORT_BOOTPC) {
1284                         ip_setdstifaddr_info(m, inifp->if_index, NULL);
1285                         return IPINPUT_DONTCHAIN;
1286                 }
1287         }
1288
1289         /* Avoid chaining raw sockets as ipsec checks occur later for them */
1290         if (ip_protox[ip->ip_p]->pr_flags & PR_LASTHDR) {
1291                 return IPINPUT_DONTCHAIN;
1292         }
1293
1294         return retval;
1295 #if !defined(__i386__) && !defined(__x86_64__)
1296 bad:
1297         m_freem(m);
1298         return IPINPUT_FREED;
1299 #endif
1300 }
1301
/*
 * Decide what to do with the packet chain *mp, whose head carries the IP
 * header `ip', received on `inifp'.
 *
 * Returns:
 *   IP_CHECK_IF_OURS     the destination address matches this host; the
 *                        destination interface address information has been
 *                        recorded on the packet(s).
 *   IP_CHECK_IF_FORWARD  no local match and ipforwarding is enabled.
 *   IP_CHECK_IF_DROP     no acceptable match; *mp is set to NULL.
 *
 * Because the call to m_pullup() may free the mbuf, the function frees the
 * mbuf packet chain itself before it returns IP_CHECK_IF_DROP.  When
 * m_pullup() hands back a different mbuf, *mp is updated to point to it.
 */
static ip_check_if_result_t
ip_input_check_interface(struct mbuf **mp, struct ip *ip, struct ifnet *inifp)
{
        struct mbuf *m = *mp;
        struct in_ifaddr *ia = NULL;
        struct in_ifaddr *best_ia = NULL;       /* address matching ip_dst, if any */
        struct ifnet *match_ifp = NULL;         /* interface the match is attributed to */
        ip_check_if_result_t result = IP_CHECK_IF_NONE;

        /*
         * Host broadcast and all network broadcast addresses are always a match
         */
        if (ip->ip_dst.s_addr == (u_int32_t)INADDR_BROADCAST ||
            ip->ip_dst.s_addr == INADDR_ANY) {
                ip_input_setdst_chain(m, inifp->if_index, NULL);
                return IP_CHECK_IF_OURS;
        }

        /*
         * Check for a match in the hash bucket.
         */
        lck_rw_lock_shared(in_ifaddr_rwlock);
        TAILQ_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) {
                if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr) {
                        best_ia = ia;
                        match_ifp = best_ia->ia_ifp;

                        if (ia->ia_ifp == inifp || (inifp->if_flags & IFF_LOOPBACK) ||
                            (m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
                                /*
                                 * A locally originated packet or packet from the loopback
                                 * interface is always an exact interface address match
                                 */
                                match_ifp = inifp;
                                break;
                        }
                        /*
                         * Continue the loop in case there's an exact match with another
                         * interface
                         */
                }
        }
        if (best_ia != NULL) {
                /*
                 * An address matched.  It is refused only when it lives on
                 * another interface, forwarding is off, and the interface
                 * check is either strong, or hybrid with the matching
                 * interface being of the IPSEC or UTUN family.
                 */
                if (match_ifp != inifp && ipforwarding == 0 &&
                    ((ip_checkinterface == IP_CHECKINTERFACE_HYBRID_ES &&
                    (match_ifp->if_family == IFNET_FAMILY_IPSEC ||
                    match_ifp->if_family == IFNET_FAMILY_UTUN)) ||
                    ip_checkinterface == IP_CHECKINTERFACE_STRONG_ES)) {
                        /*
                         * Drop when interface address check is strict and forwarding
                         * is disabled
                         */
                        result = IP_CHECK_IF_DROP;
                } else {
                        result = IP_CHECK_IF_OURS;
                        ip_input_setdst_chain(m, 0, best_ia);
                }
        }
        lck_rw_done(in_ifaddr_rwlock);

        if (result == IP_CHECK_IF_NONE && (inifp->if_flags & IFF_BROADCAST)) {
                /*
                 * Check for broadcast addresses.
                 *
                 * Only accept broadcast packets that arrive via the matching
                 * interface.  Reception of forwarded directed broadcasts would be
                 * handled via ip_forward() and ether_frameout() with the loopback
                 * into the stack for SIMPLEX interfaces handled by ether_frameout().
                 */
                struct ifaddr *ifa;

                ifnet_lock_shared(inifp);
                TAILQ_FOREACH(ifa, &inifp->if_addrhead, ifa_link) {
                        if (ifa->ifa_addr->sa_family != AF_INET) {
                                continue;
                        }
                        ia = ifatoia(ifa);
                        if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == ip->ip_dst.s_addr ||
                            ia->ia_netbroadcast.s_addr == ip->ip_dst.s_addr) {
                                ip_input_setdst_chain(m, 0, ia);
                                result = IP_CHECK_IF_OURS;
                                match_ifp = inifp;
                                break;
                        }
                }
                ifnet_lock_done(inifp);
        }

        /* Allow DHCP/BootP responses through */
        if (result == IP_CHECK_IF_NONE && (inifp->if_eflags & IFEF_AUTOCONFIGURING) &&
            ip->ip_p == IPPROTO_UDP && (IP_VHL_HL(ip->ip_vhl) << 2) == sizeof(struct ip)) {
                struct udpiphdr *ui;

                if (m->m_len < sizeof(struct udpiphdr)) {
                        if ((m = m_pullup(m, sizeof(struct udpiphdr))) == NULL) {
                                /* The mbuf is gone; report that to the caller */
                                OSAddAtomic(1, &udpstat.udps_hdrops);
                                *mp = NULL;
                                return IP_CHECK_IF_DROP;
                        }
                        /*
                         * m_pullup can return a different mbuf
                         */
                        *mp = m;
                        ip = mtod(m, struct ip *);
                }
                ui = mtod(m, struct udpiphdr *);
                if (ntohs(ui->ui_dport) == IPPORT_BOOTPC) {
                        /* Only a single packet, not a chain, is expected here */
                        ASSERT(m->m_nextpkt == NULL);
                        ip_setdstifaddr_info(m, inifp->if_index, NULL);
                        result = IP_CHECK_IF_OURS;
                        match_ifp = inifp;
                }
        }

        /* Nothing matched: forward when allowed, drop otherwise */
        if (result == IP_CHECK_IF_NONE) {
                if (ipforwarding == 0) {
                        result = IP_CHECK_IF_DROP;
                } else {
                        result = IP_CHECK_IF_FORWARD;
                        ip_input_setdst_chain(m, inifp->if_index, NULL);
                }
        }

        if (result == IP_CHECK_IF_OURS && match_ifp != inifp) {
                /* Accepted although the address belongs to another interface */
                ipstat.ips_rcv_if_weak_match++;

                /*  Logging is too noisy when forwarding is enabled */
                if (ip_checkinterface_debug != 0 && ipforwarding == 0) {
                        char src_str[MAX_IPv4_STR_LEN];
                        char dst_str[MAX_IPv4_STR_LEN];

                        inet_ntop(AF_INET, &ip->ip_src, src_str, sizeof(src_str));
                        inet_ntop(AF_INET, &ip->ip_dst, dst_str, sizeof(dst_str));
                        os_log_info(OS_LOG_DEFAULT,
                            "%s: weak ES interface match to %s for packet from %s to %s proto %u received via %s",
                            __func__, best_ia->ia_ifp->if_xname, src_str, dst_str, ip->ip_p, inifp->if_xname);
                }
        } else if (result == IP_CHECK_IF_DROP) {
                if (ip_checkinterface_debug > 0) {
                        char src_str[MAX_IPv4_STR_LEN];
                        char dst_str[MAX_IPv4_STR_LEN];

                        inet_ntop(AF_INET, &ip->ip_src, src_str, sizeof(src_str));
                        inet_ntop(AF_INET, &ip->ip_dst, dst_str, sizeof(dst_str));
                        os_log(OS_LOG_DEFAULT,
                            "%s: no interface match for packet from %s to %s proto %u received via %s",
                            __func__, src_str, dst_str, ip->ip_p, inifp->if_xname);
                }
                /* Count every packet of the chain, then free the whole chain */
                struct mbuf *tmp_mbuf = m;
                while (tmp_mbuf != NULL) {
                        ipstat.ips_rcv_if_no_match++;
                        tmp_mbuf = tmp_mbuf->m_nextpkt;
                }
                m_freem_list(m);
                *mp = NULL;
        }

        return result;
}
1465
/*
 * Second-pass input processing for a chain of npkts_in_chain packets
 * (linked through their next-packet pointers, bytes_in_chain bytes in
 * total) that arrived on inifp.
 *
 * Only the IP header of the first packet, m, is inspected to decide what
 * happens to the chain:
 *   - delivered locally (label "ours"): optional reassembly, IPsec policy
 *     check, then ip_input_dispatch_chain();
 *   - not ours: every packet is unlinked and individually forwarded via
 *     ip_forward(), or freed when ipforwarding is 0;
 *   - dropped: multicast group not joined on inifp, or
 *     ip_input_check_interface() reported IP_CHECK_IF_DROP.
 *
 * args is unused (see the #pragma below).
 */
1466 static void
1467 ip_input_second_pass(struct mbuf *m, struct ifnet *inifp,
1468     int npkts_in_chain, int bytes_in_chain, struct ip_fw_in_args *args)
1469 {
1470         struct mbuf             *tmp_mbuf = NULL;
1471         unsigned int            hlen;
1472
1473 #pragma unused (args)
1474
1475         struct ip *ip = mtod(m, struct ip *);
1476         hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1477
             /* Count every packet of the chain as received */
1478         OSAddAtomic(npkts_in_chain, &ipstat.ips_total);
1479
1480         /*
1481          * Naively assume we can attribute inbound data to the route we would
1482          * use to send to this destination. Asymmetric routing breaks this
1483          * assumption, but it still allows us to account for traffic from
1484          * a remote node in the routing table.
1485          * this has a very significant performance impact so we bypass
1486          * if nstat_collect is disabled. We may also bypass if the
1487          * protocol is tcp in the future because tcp will have a route that
1488          * we can use to attribute the data to. That does mean we would not
1489          * account for forwarded tcp traffic.
1490          */
1491         ip_input_update_nstat(inifp, ip->ip_src, npkts_in_chain,
1492             bytes_in_chain);
1493
1494         /*
1495          * Check our list of addresses, to see if the packet is for us.
1496          * If we don't have any addresses, assume any unicast packet
1497          * we receive might be for us (and let the upper layers deal
1498          * with it).
1499          */
1500         tmp_mbuf = m;
1501         if (TAILQ_EMPTY(&in_ifaddrhead)) {
                     /*
                      * Record the arrival interface on each packet that is
                      * not flagged multicast/broadcast, then deliver the
                      * whole chain locally.
                      */
1502                 while (tmp_mbuf != NULL) {
1503                         if (!(tmp_mbuf->m_flags & (M_MCAST | M_BCAST))) {
1504                                 ip_setdstifaddr_info(tmp_mbuf, inifp->if_index,
1505                                     NULL);
1506                         }
1507                         tmp_mbuf = mbuf_nextpkt(tmp_mbuf);
1508                 }
1509                 goto ours;
1510         }
1511
1512         /*
1513          * Enable a consistency check between the destination address
1514          * and the arrival interface for a unicast packet (the RFC 1122
1515          * strong ES model) if IP forwarding is disabled and the packet
1516          * is not locally generated
1517          *
1518          * XXX - Checking also should be disabled if the destination
1519          * address is ipnat'ed to a different interface.
1520          *
1521          * XXX - Checking is incompatible with IP aliases added
1522          * to the loopback interface instead of the interface where
1523          * the packets are received.
1524          */
1525         if (!IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
1526                 ip_check_if_result_t ip_check_if_result = IP_CHECK_IF_NONE;
1527
                     /* m is passed by reference; the callee may update it */
1528                 ip_check_if_result = ip_input_check_interface(&m, ip, inifp);
1529                 ASSERT(ip_check_if_result != IP_CHECK_IF_NONE);
1530                 if (ip_check_if_result == IP_CHECK_IF_OURS) {
1531                         goto ours;
1532                 } else if (ip_check_if_result == IP_CHECK_IF_DROP) {
                             /*
                              * NOTE(review): nothing is freed here; the drop
                              * path of ip_input_check_interface() appears to
                              * free the list and NULL the pointer itself.
                              */
1533                         return;
1534                 }
                     /* Any other result falls through to the forwarding loop */
1535         } else {
1536                 struct in_multi *inm;
1537                 /*
1538                  * See if we belong to the destination multicast group on the
1539                  * arrival interface.
1540                  */
1541                 in_multihead_lock_shared();
1542                 IN_LOOKUP_MULTI(&ip->ip_dst, inifp, inm);
1543                 in_multihead_lock_done();
1544                 if (inm == NULL) {
                             /* Not a member: drop and count the entire chain */
1545                         OSAddAtomic(npkts_in_chain, &ipstat.ips_notmember);
1546                         m_freem_list(m);
1547                         KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1548                         return;
1549                 }
1550                 ip_input_setdst_chain(m, inifp->if_index, NULL);
                     /* presumably releases the reference taken by the lookup */
1551                 INM_REMREF(inm);
1552                 goto ours;
1553         }
1554
             /*
              * Unlink each packet from the chain, then either drop it
              * (forwarding disabled) or hand it to ip_forward().
              */
1555         tmp_mbuf = m;
1556         struct mbuf *nxt_mbuf = NULL;
1557         while (tmp_mbuf != NULL) {
1558                 nxt_mbuf = mbuf_nextpkt(tmp_mbuf);
1559                 /*
1560                  * Not for us; forward if possible and desirable.
1561                  */
1562                 mbuf_setnextpkt(tmp_mbuf, NULL);
1563                 if (ipforwarding == 0) {
1564                         OSAddAtomic(1, &ipstat.ips_cantforward);
1565                         m_freem(tmp_mbuf);
1566                 } else {
1567                         ip_forward(tmp_mbuf, 0, NULL);
1568                 }
1569                 tmp_mbuf = nxt_mbuf;
1570         }
1571         KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1572         return;
1573 ours:
1574         ip = mtod(m, struct ip *); /* in case it changed */
1575         /*
1576          * If offset is set, must reassemble.
1577          */
1578         if (ip->ip_off & ~(IP_DF | IP_RF)) {
                     /* Fragments must reach this point as single packets */
1579                 VERIFY(npkts_in_chain == 1);
1580                 m = ip_reass(m);
1581                 if (m == NULL) {
                             /* No complete datagram was returned; nothing to deliver */
1582                         return;
1583                 }
1584                 ip = mtod(m, struct ip *);
1585                 /* Get the header length of the reassembled packet */
1586                 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1587         }
1588
1589         /*
1590          * Further protocols expect the packet length to be w/o the
1591          * IP header.
1592          */
             /* Only the first packet's ip_len is adjusted at this point */
1593         ip->ip_len -= hlen;
1594
1595 #if IPSEC
1596         /*
1597          * enforce IPsec policy checking if we are seeing last header.
1598          * note that we do not visit this with protocols with pcb layer
1599          * code - like udp/tcp/raw ip.
1600          */
1601         if (ipsec_bypass == 0 && (ip_protox[ip->ip_p]->pr_flags & PR_LASTHDR)) {
1602                 VERIFY(npkts_in_chain == 1);
1603                 if (ipsec4_in_reject(m, NULL)) {
1604                         IPSEC_STAT_INCREMENT(ipsecstat.in_polvio);
1605                         goto bad;
1606                 }
1607         }
1608 #endif /* IPSEC */
1609
1610         /*
1611          * Switch out to protocol's input routine.
1612          */
1613         OSAddAtomic(npkts_in_chain, &ipstat.ips_delivered);
1614
1615         ip_input_dispatch_chain(m);
1616
1617         KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1618         return;
1619 bad:
             /*
              * Only reached from the IPsec policy check above, where the
              * chain is verified to hold one packet, hence m_freem() rather
              * than m_freem_list().
              */
1620         KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1621         m_freem(m);
1622 }
1623
1624 void
1625 ip_input_process_list(struct mbuf *packet_list)
1626 {
1627         pktchain_elm_t  pktchain_tbl[PKTTBL_SZ];
1628
1629         struct mbuf     *packet = NULL;
1630         struct mbuf     *modm = NULL; /* modified mbuf */
1631         int             retval = 0;
1632 #if (DEBUG || DEVELOPMENT)
1633         struct timeval start_tv;
1634 #endif /* (DEBUG || DEVELOPMENT) */
1635         int     num_pkts = 0;
1636         int chain = 0;
1637         struct ip_fw_in_args       args;
1638
1639         if (ip_chaining == 0) {
1640                 struct mbuf *m = packet_list;
1641 #if (DEBUG || DEVELOPMENT)
1642                 if (ip_input_measure) {
1643                         net_perf_start_time(&net_perf, &start_tv);
1644                 }
1645 #endif /* (DEBUG || DEVELOPMENT) */
1646
1647                 while (m) {
1648                         packet_list = mbuf_nextpkt(m);
1649                         mbuf_setnextpkt(m, NULL);
1650                         ip_input(m);
1651                         m = packet_list;
1652                         num_pkts++;
1653                 }
1654 #if (DEBUG || DEVELOPMENT)
1655                 if (ip_input_measure) {
1656                         net_perf_measure_time(&net_perf, &start_tv, num_pkts);
1657                 }
1658 #endif /* (DEBUG || DEVELOPMENT) */
1659                 return;
1660         }
1661 #if (DEBUG || DEVELOPMENT)
1662         if (ip_input_measure) {
1663                 net_perf_start_time(&net_perf, &start_tv);
1664         }
1665 #endif /* (DEBUG || DEVELOPMENT) */
1666
1667         bzero(&pktchain_tbl, sizeof(pktchain_tbl));
1668 restart_list_process:
1669         chain = 0;
1670         for (packet = packet_list; packet; packet = packet_list) {
1671                 packet_list = mbuf_nextpkt(packet);
1672                 mbuf_setnextpkt(packet, NULL);
1673
1674                 num_pkts++;
1675                 modm = NULL;
1676                 bzero(&args, sizeof(args));
1677
1678                 retval = ip_input_first_pass(packet, &args, &modm);
1679
1680                 if (retval == IPINPUT_DOCHAIN) {
1681                         if (modm) {
1682                                 packet = modm;
1683                         }
1684                         packet = ip_chain_insert(packet, &pktchain_tbl[0]);
1685                         if (packet == NULL) {
1686                                 ipstat.ips_rxc_chained++;
1687                                 chain++;
1688                                 if (chain > ip_chainsz) {
1689                                         break;
1690                                 }
1691                         } else {
1692                                 ipstat.ips_rxc_collisions++;
1693                                 break;
1694                         }
1695                 } else if (retval == IPINPUT_DONTCHAIN) {
1696                         /* in order to preserve order, exit from chaining */
1697                         if (modm) {
1698                                 packet = modm;
1699                         }
1700                         ipstat.ips_rxc_notchain++;
1701                         break;
1702                 } else {
1703                         /* packet was freed or delivered, do nothing. */
1704                 }
1705         }
1706
1707         /* do second pass here for pktchain_tbl */
1708         if (chain) {
1709                 ip_input_second_pass_loop_tbl(&pktchain_tbl[0], &args);
1710         }
1711
1712         if (packet) {
1713                 /*
1714                  * equivalent update in chaining case if performed in
1715                  * ip_input_second_pass_loop_tbl().
1716                  */
1717 #if (DEBUG || DEVELOPMENT)
1718                 if (ip_input_measure) {
1719                         net_perf_histogram(&net_perf, 1);
1720                 }
1721 #endif /* (DEBUG || DEVELOPMENT) */
1722                 ip_input_second_pass(packet, packet->m_pkthdr.rcvif,
1723                     1, packet->m_pkthdr.len, &args);
1724         }
1725
1726         if (packet_list) {
1727                 goto restart_list_process;
1728         }
1729
1730 #if (DEBUG || DEVELOPMENT)
1731         if (ip_input_measure) {
1732                 net_perf_measure_time(&net_perf, &start_tv, num_pkts);
1733         }
1734 #endif /* (DEBUG || DEVELOPMENT) */
1735 }
1736 /*
1737  * Ip input routine.  Checksum and byte swap header.  If fragmented
1738  * try to reassemble.  Process options.  Pass to next level.
      *
      * Handles a single packet m (the non-chained input path; counted in
      * ips_rxc_notlist).  Outline of the visible flow:
      *   1. alignment fixup, DUMMYNET tag recovery, and a short-cut for
      *      packets that carry an inject filter reference;
      *   2. header validation (length, version, header length, loopback
      *      and link-local rules, checksum) and length trimming;
      *   3. PF hook (when enabled) and IP option processing;
      *   4. decision: local delivery ("ours"), forward, or drop.
      *
      * NOTE(review): every exit visible here has either freed m or handed
      * it to another routine, so callers should treat m as consumed.
1739  */
1740 void
1741 ip_input(struct mbuf *m)
1742 {
1743         struct ip *ip;
1744         unsigned int hlen;
1745         u_short sum = 0;
1746 #if DUMMYNET
1747         struct ip_fw_args args;
1748         struct m_tag    *tag;
1749 #endif
1750         ipfilter_t inject_filter_ref = NULL;
1751         struct ifnet *inifp;
1752
1753         /* Check if the mbuf is still valid after interface filter processing */
1754         MBUF_INPUT_CHECK(m, m->m_pkthdr.rcvif);
1755         inifp = m->m_pkthdr.rcvif;
1756         VERIFY(inifp != NULL);
1757
1758         ipstat.ips_rxc_notlist++;
1759
1760         /* Perform IP header alignment fixup, if needed */
1761         IP_HDR_ALIGNMENT_FIXUP(m, inifp, goto bad);
1762
1763         m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
1764
1765 #if DUMMYNET
1766         bzero(&args, sizeof(struct ip_fw_args));
1767
1768         /*
1769          * Don't bother searching for tag(s) if there's none.
1770          */
1771         if (SLIST_EMPTY(&m->m_pkthdr.tags)) {
1772                 goto ipfw_tags_done;
1773         }
1774
1775         /* Grab info from mtags prepended to the chain */
1776         if ((tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID,
1777             KERNEL_TAG_TYPE_DUMMYNET, NULL)) != NULL) {
1778                 struct dn_pkt_tag *dn_tag;
1779
                     /* The dummynet payload sits right after the tag header */
1780                 dn_tag = (struct dn_pkt_tag *)(tag + 1);
1781                 args.fwa_pf_rule = dn_tag->dn_pf_rule;
1782
1783                 m_tag_delete(m, tag);
1784         }
1785
1786 #if DIAGNOSTIC
1787         if (m == NULL || !(m->m_flags & M_PKTHDR)) {
1788                 panic("ip_input no HDR");
1789         }
1790 #endif
1791
1792         if (args.fwa_pf_rule) {
1793                 /* dummynet already filtered us */
1794                 ip = mtod(m, struct ip *);
1795                 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1796                 inject_filter_ref = ipf_get_inject_filter(m);
                     /*
                      * NOTE(review): this inner test repeats the enclosing
                      * condition, so the jump is always taken here.
                      */
1797                 if (args.fwa_pf_rule) {
1798                         goto check_with_pf;
1799                 }
1800         }
1801 ipfw_tags_done:
1802 #endif /* DUMMYNET */
1803
1804         /*
1805          * No need to process packet twice if we've already seen it.
1806          */
1807         if (!SLIST_EMPTY(&m->m_pkthdr.tags)) {
1808                 inject_filter_ref = ipf_get_inject_filter(m);
1809         }
1810         if (inject_filter_ref != NULL) {
                     /*
                      * Skip all validation below: convert ip_len/ip_off to
                      * host order, strip the header length from ip_len and
                      * dispatch directly, passing the filter reference on.
                      */
1811                 ip = mtod(m, struct ip *);
1812                 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1813
1814                 DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL,
1815                     struct ip *, ip, struct ifnet *, inifp,
1816                     struct ip *, ip, struct ip6_hdr *, NULL);
1817
1818                 ip->ip_len = ntohs(ip->ip_len) - hlen;
1819                 ip->ip_off = ntohs(ip->ip_off);
1820                 ip_proto_dispatch_in(m, hlen, ip->ip_p, inject_filter_ref);
1821                 return;
1822         }
1823
1824         OSAddAtomic(1, &ipstat.ips_total);
             /* Jumps into the body of the length check further down */
1825         if (m->m_pkthdr.len < sizeof(struct ip)) {
1826                 goto tooshort;
1827         }
1828
             /* Make the fixed part of the header contiguous in the first mbuf */
1829         if (m->m_len < sizeof(struct ip) &&
1830             (m = m_pullup(m, sizeof(struct ip))) == NULL) {
                     /* m is NULL here, so there is nothing left to free */
1831                 OSAddAtomic(1, &ipstat.ips_toosmall);
1832                 return;
1833         }
1834         ip = mtod(m, struct ip *);
1835
1836         KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr, ip->ip_src.s_addr,
1837             ip->ip_p, ip->ip_off, ip->ip_len);
1838
1839         if (IP_VHL_V(ip->ip_vhl) != IPVERSION) {
1840                 OSAddAtomic(1, &ipstat.ips_badvers);
1841                 goto bad;
1842         }
1843
             /* Header length field counts 32-bit words; convert to bytes */
1844         hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1845         if (hlen < sizeof(struct ip)) {         /* minimum header length */
1846                 OSAddAtomic(1, &ipstat.ips_badhlen);
1847                 goto bad;
1848         }
             /* Pull up the options too when they are not in the first mbuf */
1849         if (hlen > m->m_len) {
1850                 if ((m = m_pullup(m, hlen)) == NULL) {
1851                         OSAddAtomic(1, &ipstat.ips_badhlen);
1852                         return;
1853                 }
1854                 ip = mtod(m, struct ip *);
1855         }
1856
1857         /* 127/8 must not appear on wire - RFC1122 */
1858         if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
1859             (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
1860                 /*
1861                  * Allow for the following exceptions:
1862                  *
1863                  *   1. If the packet was sent to loopback (i.e. rcvif
1864                  *      would have been set earlier at output time.)
1865                  *
1866                  *   2. If the packet was sent out on loopback from a local
1867                  *      source address which belongs to a non-loopback
1868                  *      interface (i.e. rcvif may not necessarily be a
1869                  *      loopback interface, hence the test for PKTF_LOOP.)
1870                  *      Unlike IPv6, there is no interface scope ID, and
1871                  *      therefore we don't care so much about PKTF_IFINFO.
1872                  */
1873                 if (!(inifp->if_flags & IFF_LOOPBACK) &&
1874                     !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
1875                         OSAddAtomic(1, &ipstat.ips_badaddr);
1876                         goto bad;
1877                 }
1878         }
1879
1880         /* IPv4 Link-Local Addresses as defined in RFC3927 */
1881         if ((IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) ||
1882             IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)))) {
1883                 ip_linklocal_stat.iplls_in_total++;
1884                 if (ip->ip_ttl != MAXTTL) {
1885                         OSAddAtomic(1, &ip_linklocal_stat.iplls_in_badttl);
1886                         /* Silently drop link local traffic with bad TTL */
1887                         if (!ip_linklocal_in_allowbadttl) {
1888                                 goto bad;
1889                         }
1890                 }
1891         }
1892
             /* A non-zero result is treated as a bad header checksum: drop */
1893         sum = ip_cksum(m, hlen);
1894         if (sum) {
1895                 goto bad;
1896         }
1897
1898         DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL,
1899             struct ip *, ip, struct ifnet *, inifp,
1900             struct ip *, ip, struct ip6_hdr *, NULL);
1901
1902         /*
1903          * Naively assume we can attribute inbound data to the route we would
1904          * use to send to this destination. Asymmetric routing breaks this
1905          * assumption, but it still allows us to account for traffic from
1906          * a remote node in the routing table.
1907          * this has a very significant performance impact so we bypass
1908          * if nstat_collect is disabled. We may also bypass if the
1909          * protocol is tcp in the future because tcp will have a route that
1910          * we can use to attribute the data to. That does mean we would not
1911          * account for forwarded tcp traffic.
1912          */
1913         if (nstat_collect) {
1914                 struct rtentry *rt =
1915                     ifnet_cached_rtlookup_inet(inifp, ip->ip_src);
1916                 if (rt != NULL) {
1917                         nstat_route_rx(rt, 1, m->m_pkthdr.len, 0);
1918                         rtfree(rt);
1919                 }
1920         }
1921
1922         /*
1923          * Convert fields to host representation.
1924          */
1925 #if BYTE_ORDER != BIG_ENDIAN
1926         NTOHS(ip->ip_len);
1927 #endif
1928
             /* Total length must at least cover the header itself */
1929         if (ip->ip_len < hlen) {
1930                 OSAddAtomic(1, &ipstat.ips_badlen);
1931                 goto bad;
1932         }
1933
1934 #if BYTE_ORDER != BIG_ENDIAN
1935         NTOHS(ip->ip_off);
1936 #endif
1937         /*
1938          * Check that the amount of data in the buffers
1939          * is as at least much as the IP header would have us expect.
1940          * Trim mbufs if longer than we expect.
1941          * Drop packet if shorter than we expect.
1942          */
1943         if (m->m_pkthdr.len < ip->ip_len) {
                     /* also entered by goto from the early packet-length check */
1944 tooshort:
1945                 OSAddAtomic(1, &ipstat.ips_tooshort);
1946                 goto bad;
1947         }
1948         if (m->m_pkthdr.len > ip->ip_len) {
1949                 ip_input_adjust(m, ip, inifp);
1950         }
1951
1952 #if DUMMYNET
1953 check_with_pf:
1954 #endif
1955 #if PF
1956         /* Invoke inbound packet filter */
1957         if (PF_IS_ENABLED) {
1958                 int error;
1959 #if DUMMYNET
1960                 error = pf_af_hook(inifp, NULL, &m, AF_INET, TRUE, &args);
1961 #else
1962                 error = pf_af_hook(inifp, NULL, &m, AF_INET, TRUE, NULL);
1963 #endif /* DUMMYNET */
1964                 if (error != 0 || m == NULL) {
1965                         if (m != NULL) {
1966                                 panic("%s: unexpected packet %p\n",
1967                                     __func__, m);
1968                                 /* NOTREACHED */
1969                         }
1970                         /* Already freed by callee */
1971                         return;
1972                 }
                     /* The hook receives &m, so re-derive the header pointers */
1973                 ip = mtod(m, struct ip *);
1974                 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1975         }
1976 #endif /* PF */
1977
1978 #if IPSEC
             /*
              * NOTE(review): "pass" is the very next statement, so this
              * branch does not change control flow whichever way it goes.
              */
1979         if (ipsec_bypass == 0 && ipsec_gethist(m, NULL)) {
1980                 goto pass;
1981         }
1982 #endif
1983
1984 pass:
1985         /*
1986          * Process options and, if not destined for us,
1987          * ship it on.  ip_dooptions returns 1 when an
1988          * error was detected (causing an icmp message
1989          * to be sent and the original packet to be freed).
1990          */
1991         ip_nhops = 0;           /* for source routed packets */
1992         if (hlen > sizeof(struct ip) && ip_dooptions(m, 0, NULL)) {
1993                 return;
1994         }
1995
1996         /*
1997          * Check our list of addresses, to see if the packet is for us.
1998          * If we don't have any addresses, assume any unicast packet
1999          * we receive might be for us (and let the upper layers deal
2000          * with it).
2001          */
2002         if (TAILQ_EMPTY(&in_ifaddrhead) && !(m->m_flags & (M_MCAST | M_BCAST))) {
2003                 ip_setdstifaddr_info(m, inifp->if_index, NULL);
2004                 goto ours;
2005         }
2006
2007         /*
2008          * Enable a consistency check between the destination address
2009          * and the arrival interface for a unicast packet (the RFC 1122
2010          * strong ES model) if IP forwarding is disabled and the packet
2011          * is not locally generated and the packet is not subject to
2012          * 'ipfw fwd'.
2013          *
2014          * XXX - Checking also should be disabled if the destination
2015          * address is ipnat'ed to a different interface.
2016          *
2017          * XXX - Checking is incompatible with IP aliases added
2018          * to the loopback interface instead of the interface where
2019          * the packets are received.
2020          */
2021         if (!IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
2022                 ip_check_if_result_t check_if_result = IP_CHECK_IF_NONE;
2023
2024                 check_if_result = ip_input_check_interface(&m, ip, inifp);
2025                 ASSERT(check_if_result != IP_CHECK_IF_NONE);
2026                 if (check_if_result == IP_CHECK_IF_OURS) {
2027                         goto ours;
2028                 } else if (check_if_result == IP_CHECK_IF_DROP) {
                             /*
                              * NOTE(review): nothing is freed here; the drop
                              * path of ip_input_check_interface() appears to
                              * free the packet and NULL the pointer itself.
                              */
2029                         return;
2030                 }
                     /* Any other result falls through to the forwarding code */
2031         } else {
2032                 struct in_multi *inm;
2033                 /*
2034                  * See if we belong to the destination multicast group on the
2035                  * arrival interface.
2036                  */
2037                 in_multihead_lock_shared();
2038                 IN_LOOKUP_MULTI(&ip->ip_dst, inifp, inm);
2039                 in_multihead_lock_done();
2040                 if (inm == NULL) {
2041                         OSAddAtomic(1, &ipstat.ips_notmember);
2042                         m_freem(m);
2043                         return;
2044                 }
2045                 ip_setdstifaddr_info(m, inifp->if_index, NULL);
                     /* presumably releases the reference taken by the lookup */
2046                 INM_REMREF(inm);
2047                 goto ours;
2048         }
2049
2050         /*
2051          * Not for us; forward if possible and desirable.
2052          */
2053         if (ipforwarding == 0) {
2054                 OSAddAtomic(1, &ipstat.ips_cantforward);
2055                 m_freem(m);
2056         } else {
2057                 ip_forward(m, 0, NULL);
2058         }
2059         return;
2060
2061 ours:
2062         /*
2063          * If offset or IP_MF are set, must reassemble.
2064          */
2065         if (ip->ip_off & ~(IP_DF | IP_RF)) {
2066                 m = ip_reass(m);
2067                 if (m == NULL) {
                             /* No complete datagram was returned; nothing to deliver */
2068                         return;
2069                 }
2070                 ip = mtod(m, struct ip *);
2071                 /* Get the header length of the reassembled packet */
2072                 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
2073         }
2074
2075         /*
2076          * Further protocols expect the packet length to be w/o the
2077          * IP header.
2078          */
2079         ip->ip_len -= hlen;
2080
2081
2082 #if IPSEC
2083         /*
2084          * enforce IPsec policy checking if we are seeing last header.
2085          * note that we do not visit this with protocols with pcb layer
2086          * code - like udp/tcp/raw ip.
2087          */
2088         if (ipsec_bypass == 0 && (ip_protox[ip->ip_p]->pr_flags & PR_LASTHDR)) {
2089                 if (ipsec4_in_reject(m, NULL)) {
2090                         IPSEC_STAT_INCREMENT(ipsecstat.in_polvio);
2091                         goto bad;
2092                 }
2093         }
2094 #endif /* IPSEC */
2095
2096         /*
2097          * Switch out to protocol's input routine.
2098          */
2099         OSAddAtomic(1, &ipstat.ips_delivered);
2100
2101         ip_proto_dispatch_in(m, hlen, ip->ip_p, 0);
2102         return;
2103
2104 bad:
             /* Common drop path: the packet is freed here */
2105         KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
2106         m_freem(m);
2107 }
2108
2109 static void
2110 ipq_updateparams(void)
2111 {
2112         LCK_MTX_ASSERT(&ipqlock, LCK_MTX_ASSERT_OWNED);
2113         /*
2114          * -1 for unlimited allocation.
2115          */
2116         if (maxnipq < 0) {
2117                 ipq_limit = 0;
2118         }
2119         /*
2120          * Positive number for specific bound.
2121          */
2122         if (maxnipq > 0) {
2123                 ipq_limit = maxnipq;
2124         }
2125         /*
2126          * Zero specifies no further fragment queue allocation -- set the
2127          * bound very low, but rely on implementation elsewhere to actually
2128          * prevent allocation and reclaim current queues.
2129          */
2130         if (maxnipq == 0) {
2131                 ipq_limit = 1;
2132         }
2133         /*
2134          * Arm the purge timer if not already and if there's work to do
2135          */
2136         frag_sched_timeout();
2137 }
2138
2139 static int
2140 sysctl_maxnipq SYSCTL_HANDLER_ARGS
2141 {
2142 #pragma unused(arg1, arg2)
2143         int error, i;
2144
2145         lck_mtx_lock(&ipqlock);
2146         i = maxnipq;
2147         error = sysctl_handle_int(oidp, &i, 0, req);
2148         if (error || req->newptr == USER_ADDR_NULL) {
2149                 goto done;
2150         }
2151         /* impose bounds */
2152         if (i < -1 || i > (nmbclusters / 4)) {
2153                 error = EINVAL;
2154                 goto done;
2155         }
2156         maxnipq = i;
2157         ipq_updateparams();
2158 done:
2159         lck_mtx_unlock(&ipqlock);
2160         return error;
2161 }
2162
2163 static int
2164 sysctl_maxfragsperpacket SYSCTL_HANDLER_ARGS
2165 {
2166 #pragma unused(arg1, arg2)
2167         int error, i;
2168
2169         lck_mtx_lock(&ipqlock);
2170         i = maxfragsperpacket;
2171         error = sysctl_handle_int(oidp, &i, 0, req);
2172         if (error || req->newptr == USER_ADDR_NULL) {
2173                 goto done;
2174         }
2175         maxfragsperpacket = i;
2176         ipq_updateparams();     /* see if we need to arm timer */
2177 done:
2178         lck_mtx_unlock(&ipqlock);
2179         return error;
2180 }
2181
2182 /*
2183  * Take incoming datagram fragment and try to reassemble it into
2184  * whole datagram.  If a chain for reassembly of this datagram already
2185  * exists, then it is given as fp; otherwise have to make a chain.
2186  *
2187  * The IP header is *NOT* adjusted out of iplen (but in host byte order).
2188  */
2189 static struct mbuf *
2190 ip_reass(struct mbuf *m)
2191 {
2192         struct ip *ip;
2193         struct mbuf *p, *q, *nq, *t;
2194         struct ipq *fp = NULL;
2195         struct ipqhead *head;
2196         int i, hlen, next;
2197         u_int8_t ecn, ecn0;
2198         uint32_t csum, csum_flags;
2199         uint16_t hash;
2200         struct fq_head dfq;
2201
2202         MBUFQ_INIT(&dfq);       /* for deferred frees */
2203
2204         /* If maxnipq or maxfragsperpacket is 0, never accept fragments. */
2205         if (maxnipq == 0 || maxfragsperpacket == 0) {
2206                 ipstat.ips_fragments++;
2207                 ipstat.ips_fragdropped++;
2208                 m_freem(m);
2209                 if (nipq > 0) {
2210                         lck_mtx_lock(&ipqlock);
2211                         frag_sched_timeout();   /* purge stale fragments */
2212                         lck_mtx_unlock(&ipqlock);
2213                 }
2214                 return NULL;
2215         }
2216
2217         ip = mtod(m, struct ip *);
2218         hlen = IP_VHL_HL(ip->ip_vhl) << 2;
2219
2220         lck_mtx_lock(&ipqlock);
2221
2222         hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id);
2223         head = &ipq[hash];
2224
2225         /*
2226          * Look for queue of fragments
2227          * of this datagram.
2228          */
2229         TAILQ_FOREACH(fp, head, ipq_list) {
2230                 if (ip->ip_id == fp->ipq_id &&
2231                     ip->ip_src.s_addr == fp->ipq_src.s_addr &&
2232                     ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
2233                     ip->ip_p == fp->ipq_p) {
2234                         goto found;
2235                 }
2236         }
2237
2238         fp = NULL;
2239
2240         /*
2241          * Attempt to trim the number of allocated fragment queues if it
2242          * exceeds the administrative limit.
2243          */
2244         if ((nipq > (unsigned)maxnipq) && (maxnipq > 0)) {
2245                 /*
2246                  * drop something from the tail of the current queue
2247                  * before proceeding further
2248                  */
2249                 struct ipq *fq = TAILQ_LAST(head, ipqhead);
2250                 if (fq == NULL) {   /* gak */
2251                         for (i = 0; i < IPREASS_NHASH; i++) {
2252                                 struct ipq *r = TAILQ_LAST(&ipq[i], ipqhead);
2253                                 if (r) {
2254                                         ipstat.ips_fragtimeout += r->ipq_nfrags;
2255                                         frag_freef(&ipq[i], r);
2256                                         break;
2257                                 }
2258                         }
2259                 } else {
2260                         ipstat.ips_fragtimeout += fq->ipq_nfrags;
2261                         frag_freef(head, fq);
2262                 }
2263         }
2264
2265 found:
2266         /*
2267          * Leverage partial checksum offload for IP fragments.  Narrow down
2268          * the scope to cover only UDP without IP options, as that is the
2269          * most common case.
2270          *
2271          * Perform 1's complement adjustment of octets that got included/
2272          * excluded in the hardware-calculated checksum value.  Ignore cases
2273          * where the value includes the entire IPv4 header span, as the sum
2274          * for those octets would already be 0 by the time we get here; IP
2275          * has already performed its header checksum validation.  Also take
2276          * care of any trailing bytes and subtract out their partial sum.
2277          */
2278         if (ip->ip_p == IPPROTO_UDP && hlen == sizeof(struct ip) &&
2279             (m->m_pkthdr.csum_flags &
2280             (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
2281             (CSUM_DATA_VALID | CSUM_PARTIAL)) {
2282                 uint32_t start = m->m_pkthdr.csum_rx_start;
2283                 int32_t trailer = (m_pktlen(m) - ip->ip_len);
2284                 uint32_t swbytes = (uint32_t)trailer;
2285
2286                 csum = m->m_pkthdr.csum_rx_val;
2287
2288                 ASSERT(trailer >= 0);
2289                 if ((start != 0 && start != hlen) || trailer != 0) {
2290                         uint32_t datalen = ip->ip_len - hlen;
2291
2292 #if BYTE_ORDER != BIG_ENDIAN
2293                         if (start < hlen) {
2294                                 HTONS(ip->ip_len);
2295                                 HTONS(ip->ip_off);
2296                         }
2297 #endif /* BYTE_ORDER != BIG_ENDIAN */
2298                         /* callee folds in sum */
2299                         csum = m_adj_sum16(m, start, hlen, datalen, csum);
2300                         if (hlen > start) {
2301                                 swbytes += (hlen - start);
2302                         } else {
2303                                 swbytes += (start - hlen);
2304                         }
2305 #if BYTE_ORDER != BIG_ENDIAN
2306                         if (start < hlen) {
2307                                 NTOHS(ip->ip_off);
2308                                 NTOHS(ip->ip_len);
2309                         }
2310 #endif /* BYTE_ORDER != BIG_ENDIAN */
2311                 }
2312                 csum_flags = m->m_pkthdr.csum_flags;
2313
2314                 if (swbytes != 0) {
2315                         udp_in_cksum_stats(swbytes);
2316                 }
2317                 if (trailer != 0) {
2318                         m_adj(m, -trailer);
2319                 }
2320         } else {
2321                 csum = 0;
2322                 csum_flags = 0;
2323         }
2324
2325         /* Invalidate checksum */
2326         m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
2327
2328         ipstat.ips_fragments++;
2329
2330         /*
2331          * Adjust ip_len to not reflect header,
2332          * convert offset of this to bytes.
2333          */
2334         ip->ip_len -= hlen;
2335         if (ip->ip_off & IP_MF) {
2336                 /*
2337                  * Make sure that fragments have a data length
2338                  * that's a non-zero multiple of 8 bytes.
2339                  */
2340                 if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) {
2341                         OSAddAtomic(1, &ipstat.ips_toosmall);
2342                         /*
2343                          * Reassembly queue may have been found if previous
2344                          * fragments were valid; given that this one is bad,
2345                          * we need to drop it.  Make sure to set fp to NULL
2346                          * if not already, since we don't want to decrement
2347                          * ipq_nfrags as it doesn't include this packet.
2348                          */
2349                         fp = NULL;
2350                         goto dropfrag;
2351                 }
2352                 m->m_flags |= M_FRAG;
2353         } else {
2354                 /* Clear the flag in case packet comes from loopback */
2355                 m->m_flags &= ~M_FRAG;
2356         }
2357         ip->ip_off <<= 3;
2358
2359         m->m_pkthdr.pkt_hdr = ip;
2360
2361         /* Previous ip_reass() started here. */
2362         /*
2363          * Presence of header sizes in mbufs
2364          * would confuse code below.
2365          */
2366         m->m_data += hlen;
2367         m->m_len -= hlen;
2368
2369         /*
2370          * If first fragment to arrive, create a reassembly queue.
2371          */
2372         if (fp == NULL) {
2373                 fp = ipq_alloc(M_DONTWAIT);
2374                 if (fp == NULL) {
2375                         goto dropfrag;
2376                 }
2377                 TAILQ_INSERT_HEAD(head, fp, ipq_list);
2378                 nipq++;
2379                 fp->ipq_nfrags = 1;
2380                 fp->ipq_ttl = IPFRAGTTL;
2381                 fp->ipq_p = ip->ip_p;
2382                 fp->ipq_id = ip->ip_id;
2383                 fp->ipq_src = ip->ip_src;
2384                 fp->ipq_dst = ip->ip_dst;
2385                 fp->ipq_frags = m;
2386                 m->m_nextpkt = NULL;
2387                 /*
2388                  * If the first fragment has valid checksum offload
2389                  * info, the rest of fragments are eligible as well.
2390                  */
2391                 if (csum_flags != 0) {
2392                         fp->ipq_csum = csum;
2393                         fp->ipq_csum_flags = csum_flags;
2394                 }
2395                 m = NULL;       /* nothing to return */
2396                 goto done;
2397         } else {
2398                 fp->ipq_nfrags++;
2399         }
2400
2401 #define GETIP(m)        ((struct ip *)((m)->m_pkthdr.pkt_hdr))
2402
2403         /*
2404          * Handle ECN by comparing this segment with the first one;
2405          * if CE is set, do not lose CE.
2406          * drop if CE and not-ECT are mixed for the same packet.
2407          */
2408         ecn = ip->ip_tos & IPTOS_ECN_MASK;
2409         ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK;
2410         if (ecn == IPTOS_ECN_CE) {
2411                 if (ecn0 == IPTOS_ECN_NOTECT) {
2412                         goto dropfrag;
2413                 }
2414                 if (ecn0 != IPTOS_ECN_CE) {
2415                         GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE;
2416                 }
2417         }
2418         if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) {
2419                 goto dropfrag;
2420         }
2421
2422         /*
2423          * Find a segment which begins after this one does.
2424          */
2425         for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
2426                 if (GETIP(q)->ip_off > ip->ip_off) {
2427                         break;
2428                 }
2429         }
2430
2431         /*
2432          * If there is a preceding segment, it may provide some of
2433          * our data already.  If so, drop the data from the incoming
2434          * segment.  If it provides all of our data, drop us, otherwise
2435          * stick new segment in the proper place.
2436          *
2437          * If some of the data is dropped from the preceding
2438          * segment, then its checksum is invalidated.
2439          */
2440         if (p) {
2441                 i = GETIP(p)->ip_off + GETIP(p)->ip_len - ip->ip_off;
2442                 if (i > 0) {
2443                         if (i >= ip->ip_len) {
2444                                 goto dropfrag;
2445                         }
2446                         m_adj(m, i);
2447                         fp->ipq_csum_flags = 0;
2448                         ip->ip_off += i;
2449                         ip->ip_len -= i;
2450                 }
2451                 m->m_nextpkt = p->m_nextpkt;
2452                 p->m_nextpkt = m;
2453         } else {
2454                 m->m_nextpkt = fp->ipq_frags;
2455                 fp->ipq_frags = m;
2456         }
2457
2458         /*
2459          * While we overlap succeeding segments trim them or,
2460          * if they are completely covered, dequeue them.
2461          */
2462         for (; q != NULL && ip->ip_off + ip->ip_len > GETIP(q)->ip_off;
2463             q = nq) {
2464                 i = (ip->ip_off + ip->ip_len) - GETIP(q)->ip_off;
2465                 if (i < GETIP(q)->ip_len) {
2466                         GETIP(q)->ip_len -= i;
2467                         GETIP(q)->ip_off += i;
2468                         m_adj(q, i);
2469                         fp->ipq_csum_flags = 0;
2470                         break;
2471                 }
2472                 nq = q->m_nextpkt;
2473                 m->m_nextpkt = nq;
2474                 ipstat.ips_fragdropped++;
2475                 fp->ipq_nfrags--;
2476                 /* defer freeing until after lock is dropped */
2477                 MBUFQ_ENQUEUE(&dfq, q);
2478         }
2479
2480         /*
2481          * If this fragment contains similar checksum offload info
2482          * as that of the existing ones, accumulate checksum.  Otherwise,
2483          * invalidate checksum offload info for the entire datagram.
2484          */
2485         if (csum_flags != 0 && csum_flags == fp->ipq_csum_flags) {
2486                 fp->ipq_csum += csum;
2487         } else if (fp->ipq_csum_flags != 0) {
2488                 fp->ipq_csum_flags = 0;
2489         }
2490
2491
2492         /*
2493          * Check for complete reassembly and perform frag per packet
2494          * limiting.
2495          *
2496          * Frag limiting is performed here so that the nth frag has
2497          * a chance to complete the packet before we drop the packet.
2498          * As a result, n+1 frags are actually allowed per packet, but
2499          * only n will ever be stored. (n = maxfragsperpacket.)
2500          *
2501          */
2502         next = 0;
2503         for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
2504                 if (GETIP(q)->ip_off != next) {
2505                         if (fp->ipq_nfrags > maxfragsperpacket) {
2506                                 ipstat.ips_fragdropped += fp->ipq_nfrags;
2507                                 frag_freef(head, fp);
2508                         }
2509                         m = NULL;       /* nothing to return */
2510                         goto done;
2511                 }
2512                 next += GETIP(q)->ip_len;
2513         }
2514         /* Make sure the last packet didn't have the IP_MF flag */
2515         if (p->m_flags & M_FRAG) {
2516                 if (fp->ipq_nfrags > maxfragsperpacket) {
2517                         ipstat.ips_fragdropped += fp->ipq_nfrags;
2518                         frag_freef(head, fp);
2519                 }
2520                 m = NULL;               /* nothing to return */
2521                 goto done;
2522         }
2523
2524         /*
2525          * Reassembly is complete.  Make sure the packet is a sane size.
2526          */
2527         q = fp->ipq_frags;
2528         ip = GETIP(q);
2529         if (next + (IP_VHL_HL(ip->ip_vhl) << 2) > IP_MAXPACKET) {
2530                 ipstat.ips_toolong++;
2531                 ipstat.ips_fragdropped += fp->ipq_nfrags;
2532                 frag_freef(head, fp);
2533                 m = NULL;               /* nothing to return */
2534                 goto done;
2535         }
2536
2537         /*
2538          * Concatenate fragments.
2539          */
2540         m = q;
2541         t = m->m_next;
2542         m->m_next = NULL;
2543         m_cat(m, t);
2544         nq = q->m_nextpkt;
2545         q->m_nextpkt = NULL;
2546         for (q = nq; q != NULL; q = nq) {
2547                 nq = q->m_nextpkt;
2548                 q->m_nextpkt = NULL;
2549                 m_cat(m, q);
2550         }
2551
2552         /*
2553          * Store partial hardware checksum info from the fragment queue;
2554          * the receive start offset is set to 20 bytes (see code at the
2555          * top of this routine.)
2556          */
2557         if (fp->ipq_csum_flags != 0) {
2558                 csum = fp->ipq_csum;
2559
2560                 ADDCARRY(csum);
2561
2562                 m->m_pkthdr.csum_rx_val = csum;
2563                 m->m_pkthdr.csum_rx_start = sizeof(struct ip);
2564                 m->m_pkthdr.csum_flags = fp->ipq_csum_flags;
2565         } else if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) ||
2566             (m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
2567                 /* loopback checksums are always OK */
2568                 m->m_pkthdr.csum_data = 0xffff;
2569                 m->m_pkthdr.csum_flags =
2570                     CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
2571                     CSUM_IP_CHECKED | CSUM_IP_VALID;
2572         }
2573
2574         /*
2575          * Create header for new ip packet by modifying header of first
2576          * packet; dequeue and discard fragment reassembly header.
2577          * Make header visible.
2578          */
2579         ip->ip_len = (IP_VHL_HL(ip->ip_vhl) << 2) + next;
2580         ip->ip_src = fp->ipq_src;
2581         ip->ip_dst = fp->ipq_dst;
2582
2583         fp->ipq_frags = NULL;   /* return to caller as 'm' */
2584         frag_freef(head, fp);
2585         fp = NULL;
2586
2587         m->m_len += (IP_VHL_HL(ip->ip_vhl) << 2);
2588         m->m_data -= (IP_VHL_HL(ip->ip_vhl) << 2);
2589         /* some debugging cruft by sklower, below, will go away soon */
2590         if (m->m_flags & M_PKTHDR) {    /* XXX this should be done elsewhere */
2591                 m_fixhdr(m);
2592         }
2593         ipstat.ips_reassembled++;
2594
2595         /* arm the purge timer if not already and if there's work to do */
2596         frag_sched_timeout();
2597         lck_mtx_unlock(&ipqlock);
2598         /* perform deferred free (if needed) now that lock is dropped */
2599         if (!MBUFQ_EMPTY(&dfq)) {
2600                 MBUFQ_DRAIN(&dfq);
2601         }
2602         VERIFY(MBUFQ_EMPTY(&dfq));
2603         return m;
2604
2605 done:
2606         VERIFY(m == NULL);
2607         /* arm the purge timer if not already and if there's work to do */
2608         frag_sched_timeout();
2609         lck_mtx_unlock(&ipqlock);
2610         /* perform deferred free (if needed) */
2611         if (!MBUFQ_EMPTY(&dfq)) {
2612                 MBUFQ_DRAIN(&dfq);
2613         }
2614         VERIFY(MBUFQ_EMPTY(&dfq));
2615         return NULL;
2616
2617 dropfrag:
2618         ipstat.ips_fragdropped++;
2619         if (fp != NULL) {
2620                 fp->ipq_nfrags--;
2621         }
2622         /* arm the purge timer if not already and if there's work to do */
2623         frag_sched_timeout();
2624         lck_mtx_unlock(&ipqlock);
2625         m_freem(m);
2626         /* perform deferred free (if needed) */
2627         if (!MBUFQ_EMPTY(&dfq)) {
2628                 MBUFQ_DRAIN(&dfq);
2629         }
2630         VERIFY(MBUFQ_EMPTY(&dfq));
2631         return NULL;
2632 #undef GETIP
2633 }
2634
2635 /*
2636  * Free a fragment reassembly header and all
2637  * associated datagrams.
2638  */
2639 static void
2640 frag_freef(struct ipqhead *fhp, struct ipq *fp)
2641 {
2642         LCK_MTX_ASSERT(&ipqlock, LCK_MTX_ASSERT_OWNED);
2643
2644         fp->ipq_nfrags = 0;
2645         if (fp->ipq_frags != NULL) {
2646                 m_freem_list(fp->ipq_frags);
2647                 fp->ipq_frags = NULL;
2648         }
2649         TAILQ_REMOVE(fhp, fp, ipq_list);
2650         nipq--;
2651         ipq_free(fp);
2652 }
2653
2654 /*
2655  * IP reassembly timer processing
2656  */
2657 static void
2658 frag_timeout(void *arg)
2659 {
2660 #pragma unused(arg)
2661         struct ipq *fp;
2662         int i;
2663
2664         /*
2665          * Update coarse-grained networking timestamp (in sec.); the idea
2666          * is to piggy-back on the timeout callout to update the counter
2667          * returnable via net_uptime().
2668          */
2669         net_update_uptime();
2670
2671         lck_mtx_lock(&ipqlock);
2672         for (i = 0; i < IPREASS_NHASH; i++) {
2673                 for (fp = TAILQ_FIRST(&ipq[i]); fp;) {
2674                         struct ipq *fpp;
2675
2676                         fpp = fp;
2677                         fp = TAILQ_NEXT(fp, ipq_list);
2678                         if (--fpp->ipq_ttl == 0) {
2679                                 ipstat.ips_fragtimeout += fpp->ipq_nfrags;
2680                                 frag_freef(&ipq[i], fpp);
2681                         }
2682                 }
2683         }
2684         /*
2685          * If we are over the maximum number of fragments
2686          * (due to the limit being lowered), drain off
2687          * enough to get down to the new limit.
2688          */
2689         if (maxnipq >= 0 && nipq > (unsigned)maxnipq) {
2690                 for (i = 0; i < IPREASS_NHASH; i++) {
2691                         while (nipq > (unsigned)maxnipq &&
2692                             !TAILQ_EMPTY(&ipq[i])) {
2693                                 ipstat.ips_fragdropped +=
2694                                     TAILQ_FIRST(&ipq[i])->ipq_nfrags;
2695                                 frag_freef(&ipq[i], TAILQ_FIRST(&ipq[i]));
2696                         }
2697                 }
2698         }
2699         /* re-arm the purge timer if there's work to do */
2700         frag_timeout_run = 0;
2701         frag_sched_timeout();
2702         lck_mtx_unlock(&ipqlock);
2703 }
2704
2705 static void
2706 frag_sched_timeout(void)
2707 {
2708         LCK_MTX_ASSERT(&ipqlock, LCK_MTX_ASSERT_OWNED);
2709
2710         if (!frag_timeout_run && nipq > 0) {
2711                 frag_timeout_run = 1;
2712                 timeout(frag_timeout, NULL, hz);
2713         }
2714 }
2715
2716 /*
2717  * Drain off all datagram fragments.
2718  */
2719 static void
2720 frag_drain(void)
2721 {
2722         int i;
2723
2724         lck_mtx_lock(&ipqlock);
2725         for (i = 0; i < IPREASS_NHASH; i++) {
2726                 while (!TAILQ_EMPTY(&ipq[i])) {
2727                         ipstat.ips_fragdropped +=
2728                             TAILQ_FIRST(&ipq[i])->ipq_nfrags;
2729                         frag_freef(&ipq[i], TAILQ_FIRST(&ipq[i]));
2730                 }
2731         }
2732         lck_mtx_unlock(&ipqlock);
2733 }
2734
2735 static struct ipq *
2736 ipq_alloc(int how)
2737 {
2738         struct mbuf *t;
2739         struct ipq *fp;
2740
2741         /*
2742          * See comments in ipq_updateparams().  Keep the count separate
2743          * from nipq since the latter represents the elements already
2744          * in the reassembly queues.
2745          */
2746         if (ipq_limit > 0 && ipq_count > ipq_limit) {
2747                 return NULL;
2748         }
2749
2750         t = m_get(how, MT_FTABLE);
2751         if (t != NULL) {
2752                 atomic_add_32(&ipq_count, 1);
2753                 fp = mtod(t, struct ipq *);
2754                 bzero(fp, sizeof(*fp));
2755         } else {
2756                 fp = NULL;
2757         }
2758         return fp;
2759 }
2760
2761 static void
2762 ipq_free(struct ipq *fp)
2763 {
2764         (void) m_free(dtom(fp));
2765         atomic_add_32(&ipq_count, -1);
2766 }
2767
2768 /*
2769  * Drain callback
      *
      * Runs the three IPv4 drain routines in a fixed order: frag_drain()
      * empties every fragment reassembly queue, then in_rtqdrain() and
      * in_arpdrain(NULL) are invoked for cloned routes.  Takes no
      * arguments and returns nothing.
2770  */
2771 void
2772 ip_drain(void)
2773 {
2774         frag_drain();           /* fragments */
2775         in_rtqdrain();          /* protocol cloned routes */
2776         in_arpdrain(NULL);      /* cloned routes: ARP */
2777 }
2778
2779 /*
2780  * Do option processing on a datagram,
2781  * possibly discarding it if bad options are encountered,
2782  * or forwarding it if source-routed.
2783  * The pass argument is used when operating in the IPSTEALTH
2784  * mode to tell what options to process:
2785  * [LS]SRR (pass 0) or the others (pass 1).
2786  * The reason for as many as two passes is that when doing IPSTEALTH,
2787  * non-routing options should be processed only if the packet is for us.
2788  * Returns 1 if packet has been forwarded/freed,
2789  * 0 if the packet should be processed further.
      *
      * Parameters:
      *   m        - datagram whose IPv4 header options are walked.  When 1
      *              is returned it has been handed to ip_forward(),
      *              icmp_error() or m_freem().
      *   pass     - not referenced by this implementation (see the
      *              "#pragma unused" at the top of the body).
      *   next_hop - passed through unchanged to ip_forward().
      *
      * Options acted upon: LSRR/SSRR, RR and TS.  EOL ends the walk; NOP
      * and unrecognized options are stepped over.  Malformed options jump
      * to "bad", which calls icmp_error() with type (ICMP_PARAMPROB unless
      * a case overrides it) and code, and bumps ipstat.ips_badoptions.
2790  */
2791 static int
2792 ip_dooptions(struct mbuf *m, int pass, struct sockaddr_in *next_hop)
2793 {
2794 #pragma unused(pass)
2795         struct ip *ip = mtod(m, struct ip *);
2796         u_char *cp;
2797         struct ip_timestamp *ipt;
2798         struct in_ifaddr *ia;
2799         int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0;
2800         struct in_addr *sin, dst;
2801         u_int32_t ntime;
2802         struct sockaddr_in ipaddr = {
2803                 .sin_len = sizeof(ipaddr),
2804                 .sin_family = AF_INET,
2805                 .sin_port = 0,
2806                 .sin_addr = { .s_addr = 0 },
2807                 .sin_zero = { 0, }
2808         };
2809
2810         /* Expect 32-bit aligned data pointer on strict-align platforms */
2811         MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
2812
             /*
              * Save the destination as it arrived (the LSRR/SSRR case may
              * overwrite ip->ip_dst), then walk the cnt option bytes that
              * follow the fixed header.  Each iteration steps cp and cnt by
              * the current option's optlen.
              */
2813         dst = ip->ip_dst;
2814         cp = (u_char *)(ip + 1);
2815         cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof(struct ip);
2816         for (; cnt > 0; cnt -= optlen, cp += optlen) {
2817                 opt = cp[IPOPT_OPTVAL];
2818                 if (opt == IPOPT_EOL) {
2819                         break;
2820                 }
2821                 if (opt == IPOPT_NOP) {
2822                         optlen = 1;
2823                 } else {
                             /*
                              * Every other option carries a length octet.
                              * Verify the octet is inside the remaining bytes,
                              * then that the stated length is at least
                              * IPOPT_OLEN + 1 and does not run past cnt.  On
                              * failure, code is the offset of the length octet
                              * from the start of the IP header.
                              */
2824                         if (cnt < IPOPT_OLEN + sizeof(*cp)) {
2825                                 code = &cp[IPOPT_OLEN] - (u_char *)ip;
2826                                 goto bad;
2827                         }
2828                         optlen = cp[IPOPT_OLEN];
2829                         if (optlen < IPOPT_OLEN + sizeof(*cp) ||
2830                             optlen > cnt) {
2831                                 code = &cp[IPOPT_OLEN] - (u_char *)ip;
2832                                 goto bad;
2833                         }
2834                 }
2835                 switch (opt) {
2836                 default:
2837                         break;
2838
2839                 /*
2840                  * Source routing with record.
2841                  * Find interface with current destination address.
2842                  * If none on this machine then drop if strictly routed,
2843                  * or do nothing if loosely routed.
2844                  * Record interface address and bring up next address
2845                  * component.  If strictly routed make sure next
2846                  * address is on directly accessible net.
2847                  */
2848                 case IPOPT_LSRR:
2849                 case IPOPT_SSRR:
                             /* option must be long enough to hold the offset octet */
2850                         if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
2851                                 code = &cp[IPOPT_OLEN] - (u_char *)ip;
2852                                 goto bad;
2853                         }
2854                         if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
2855                                 code = &cp[IPOPT_OFFSET] - (u_char *)ip;
2856                                 goto bad;
2857                         }
2858                         ipaddr.sin_addr = ip->ip_dst;
2859                         ia = (struct in_ifaddr *)ifa_ifwithaddr(SA(&ipaddr));
2860                         if (ia == NULL) {
2861                                 if (opt == IPOPT_SSRR) {
2862                                         type = ICMP_UNREACH;
2863                                         code = ICMP_UNREACH_SRCFAIL;
2864                                         goto bad;
2865                                 }
2866                                 if (!ip_dosourceroute) {
2867                                         goto nosourcerouting;
2868                                 }
2869                                 /*
2870                                  * Loose routing, and not at next destination
2871                                  * yet; nothing to do except forward.
2872                                  */
2873                                 break;
2874                         } else {
                                     /*
                                      * Only the existence of a matching address
                                      * mattered; give the reference back.
                                      */
2875                                 IFA_REMREF(&ia->ia_ifa);
2876                                 ia = NULL;
2877                         }
2878                         off--;                  /* 0 origin */
2879                         if (off > optlen - (int)sizeof(struct in_addr)) {
2880                                 /*
2881                                  * End of source route.  Should be for us.
2882                                  */
2883                                 if (!ip_acceptsourceroute) {
2884                                         goto nosourcerouting;
2885                                 }
2886                                 save_rte(cp, ip->ip_src);
2887                                 break;
2888                         }
2889
2890                         if (!ip_dosourceroute) {
2891                                 if (ipforwarding) {
2892                                         char buf[MAX_IPv4_STR_LEN];
2893                                         char buf2[MAX_IPv4_STR_LEN];
2894                                         /*
2895                                          * Acting as a router, so generate ICMP
2896                                          */
                                             /*
                                              * The label below is also the target of
                                              * the two "goto nosourcerouting"
                                              * statements earlier in this case.
                                              */
2897 nosourcerouting:
2898                                         log(LOG_WARNING,
2899                                             "attempted source route from %s "
2900                                             "to %s\n",
2901                                             inet_ntop(AF_INET, &ip->ip_src,
2902                                             buf, sizeof(buf)),
2903                                             inet_ntop(AF_INET, &ip->ip_dst,
2904                                             buf2, sizeof(buf2)));
2905                                         type = ICMP_UNREACH;
2906                                         code = ICMP_UNREACH_SRCFAIL;
2907                                         goto bad;
2908                                 } else {
2909                                         /*
2910                                          * Not acting as a router,
2911                                          * so silently drop.
2912                                          */
2913                                         OSAddAtomic(1, &ipstat.ips_cantforward);
2914                                         m_freem(m);
2915                                         return 1;
2916                                 }
2917                         }
2918
2919                         /*
2920                          * locate outgoing interface
2921                          */
2922                         (void) memcpy(&ipaddr.sin_addr, cp + off,
2923                             sizeof(ipaddr.sin_addr));
2924
                             /*
                              * Strict route: look the next hop up with
                              * ifa_ifwithdstaddr(), falling back to
                              * ifa_ifwithnet().  Loose route: ip_rtaddr().
                              */
2925                         if (opt == IPOPT_SSRR) {
2926 #define INA     struct in_ifaddr *
2927                                 if ((ia = (INA)ifa_ifwithdstaddr(
2928                                             SA(&ipaddr))) == NULL) {
2929                                         ia = (INA)ifa_ifwithnet(SA(&ipaddr));
2930                                 }
2931                         } else {
2932                                 ia = ip_rtaddr(ipaddr.sin_addr);
2933                         }
2934                         if (ia == NULL) {
2935                                 type = ICMP_UNREACH;
2936                                 code = ICMP_UNREACH_SRCFAIL;
2937                                 goto bad;
2938                         }
                             /*
                              * Make the listed hop the new destination, overwrite
                              * its slot with ia's address, and advance the
                              * option's offset octet past that slot.
                              */
2939                         ip->ip_dst = ipaddr.sin_addr;
2940                         IFA_LOCK(&ia->ia_ifa);
2941                         (void) memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
2942                             sizeof(struct in_addr));
2943                         IFA_UNLOCK(&ia->ia_ifa);
2944                         IFA_REMREF(&ia->ia_ifa);
2945                         ia = NULL;
2946                         cp[IPOPT_OFFSET] += sizeof(struct in_addr);
2947                         /*
2948                          * Let ip_intr's mcast routing check handle mcast pkts
2949                          */
2950                         forward = !IN_MULTICAST(ntohl(ip->ip_dst.s_addr));
2951                         break;
2952
                     /*
                      * Record route: write ia's address into the next free slot
                      * and advance the offset octet; a full option is ignored.
                      * INA is the cast macro #defined in the LSRR/SSRR case above.
                      */
2953                 case IPOPT_RR:
2954                         if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
2955                                 code = &cp[IPOPT_OFFSET] - (u_char *)ip;
2956                                 goto bad;
2957                         }
2958                         if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
2959                                 code = &cp[IPOPT_OFFSET] - (u_char *)ip;
2960                                 goto bad;
2961                         }
2962                         /*
2963                          * If no space remains, ignore.
2964                          */
2965                         off--;                  /* 0 origin */
2966                         if (off > optlen - (int)sizeof(struct in_addr)) {
2967                                 break;
2968                         }
2969                         (void) memcpy(&ipaddr.sin_addr, &ip->ip_dst,
2970                             sizeof(ipaddr.sin_addr));
2971                         /*
2972                          * locate outgoing interface; if we're the destination,
2973                          * use the incoming interface (should be same).
2974                          */
2975                         if ((ia = (INA)ifa_ifwithaddr(SA(&ipaddr))) == NULL) {
2976                                 if ((ia = ip_rtaddr(ipaddr.sin_addr)) == NULL) {
2977                                         type = ICMP_UNREACH;
2978                                         code = ICMP_UNREACH_HOST;
2979                                         goto bad;
2980                                 }
2981                         }
2982                         IFA_LOCK(&ia->ia_ifa);
2983                         (void) memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
2984                             sizeof(struct in_addr));
2985                         IFA_UNLOCK(&ia->ia_ifa);
2986                         IFA_REMREF(&ia->ia_ifa);
2987                         ia = NULL;
2988                         cp[IPOPT_OFFSET] += sizeof(struct in_addr);
2989                         break;
2990
                     /*
                      * Timestamp: validate the length and pointer octets,
                      * optionally fill in or match an address depending on
                      * ipt_flg, then store iptime() at ipt_ptr.
                      */
2991                 case IPOPT_TS:
2992                         code = cp - (u_char *)ip;
2993                         ipt = (struct ip_timestamp *)(void *)cp;
2994                         if (ipt->ipt_len < 4 || ipt->ipt_len > 40) {
2995                                 code = (u_char *)&ipt->ipt_len - (u_char *)ip;
2996                                 goto bad;
2997                         }
2998                         if (ipt->ipt_ptr < 5) {
2999                                 code = (u_char *)&ipt->ipt_ptr - (u_char *)ip;
3000                                 goto bad;
3001                         }
                             /*
                              * Fewer than sizeof(int32_t) bytes remain at
                              * ipt_ptr: count an overflow instead of stamping,
                              * and reject the option if the overflow counter
                              * wraps back to 0.
                              */
3002                         if (ipt->ipt_ptr >
3003                             ipt->ipt_len - (int)sizeof(int32_t)) {
3004                                 if (++ipt->ipt_oflw == 0) {
3005                                         code = (u_char *)&ipt->ipt_ptr -
3006                                             (u_char *)ip;
3007                                         goto bad;
3008                                 }
3009                                 break;
3010                         }
                             /* sin -> entry at ipt_ptr (ipt_ptr counts from 1) */
3011                         sin = (struct in_addr *)(void *)(cp + ipt->ipt_ptr - 1);
3012                         switch (ipt->ipt_flg) {
3013                         case IPOPT_TS_TSONLY:
3014                                 break;
3015
3016                         case IPOPT_TS_TSANDADDR:
                                     /* need room for an address plus a timestamp */
3017                                 if (ipt->ipt_ptr - 1 + sizeof(n_time) +
3018                                     sizeof(struct in_addr) > ipt->ipt_len) {
3019                                         code = (u_char *)&ipt->ipt_ptr -
3020                                             (u_char *)ip;
3021                                         goto bad;
3022                                 }
3023                                 ipaddr.sin_addr = dst;
3024                                 ia = (INA)ifaof_ifpforaddr(SA(&ipaddr),
3025                                     m->m_pkthdr.rcvif);
                                     /*
                                      * "continue" resumes the enclosing for loop:
                                      * the option is stepped over and nothing is
                                      * written into it.
                                      */
3026                                 if (ia == NULL) {
3027                                         continue;
3028                                 }
3029                                 IFA_LOCK(&ia->ia_ifa);
3030                                 (void) memcpy(sin, &IA_SIN(ia)->sin_addr,
3031                                     sizeof(struct in_addr));
3032                                 IFA_UNLOCK(&ia->ia_ifa);
3033                                 ipt->ipt_ptr += sizeof(struct in_addr);
3034                                 IFA_REMREF(&ia->ia_ifa);
3035                                 ia = NULL;
3036                                 break;
3037
3038                         case IPOPT_TS_PRESPEC:
3039                                 if (ipt->ipt_ptr - 1 + sizeof(n_time) +
3040                                     sizeof(struct in_addr) > ipt->ipt_len) {
3041                                         code = (u_char *)&ipt->ipt_ptr -
3042                                             (u_char *)ip;
3043                                         goto bad;
3044                                 }
                                     /*
                                      * Stamp only when the address already stored
                                      * in the entry is found by ifa_ifwithaddr();
                                      * otherwise step over the option.
                                      */
3045                                 (void) memcpy(&ipaddr.sin_addr, sin,
3046                                     sizeof(struct in_addr));
3047                                 if ((ia = (struct in_ifaddr *)ifa_ifwithaddr(
3048                                             SA(&ipaddr))) == NULL) {
3049                                         continue;
3050                                 }
3051                                 IFA_REMREF(&ia->ia_ifa);
3052                                 ia = NULL;
3053                                 ipt->ipt_ptr += sizeof(struct in_addr);
3054                                 break;
3055
3056                         default:
3057                                 /* XXX can't take &ipt->ipt_flg */
3058                                 code = (u_char *)&ipt->ipt_ptr -
3059                                     (u_char *)ip + 1;
3060                                 goto bad;
3061                         }
                             /* write the timestamp and advance ipt_ptr past it */
3062                         ntime = iptime();
3063                         (void) memcpy(cp + ipt->ipt_ptr - 1, &ntime,
3064                             sizeof(n_time));
3065                         ipt->ipt_ptr += sizeof(n_time);
3066                 }
3067         }
             /*
              * forward is only set by the LSRR/SSRR case, after it rewrote
              * ip_dst to a non-multicast next hop.
              */
3068         if (forward && ipforwarding) {
3069                 ip_forward(m, 1, next_hop);
3070                 return 1;
3071         }
3072         return 0;
             /* error exit: hand m to icmp_error() and count the bad option */
3073 bad:
3074         icmp_error(m, type, code, 0, 0);
3075         OSAddAtomic(1, &ipstat.ips_badoptions);
3076         return 1;
3077 }
3078
3079 /*
3080  * Check for the presence of the IP Router Alert option [RFC2113]
3081  * in the header of an IPv4 datagram.
3082  *
3083  * This call is not intended for use from the forwarding path; it is here
3084  * so that protocol domains may check for the presence of the option.
3085  * Given how FreeBSD's IPv4 stack is currently structured, the Router Alert
3086  * option does not have much relevance to the implementation, though this
3087  * may change in future.
3088  * Router alert options SHOULD be passed if running in IPSTEALTH mode and
3089  * we are not the endpoint.
3090  * Length checks on individual options should already have been peformed
3091  * by ip_dooptions() therefore they are folded under DIAGNOSTIC here.
3092  *
3093  * Return zero if not present or options are invalid, non-zero if present.
3094  */
3095 int
3096 ip_checkrouteralert(struct mbuf *m)
3097 {
3098         struct ip *ip = mtod(m, struct ip *);
3099         u_char *cp;
3100         int opt, optlen, cnt, found_ra;
3101
3102         found_ra = 0;
3103         cp = (u_char *)(ip + 1);
3104         cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof(struct ip);
3105         for (; cnt > 0; cnt -= optlen, cp += optlen) {
3106                 opt = cp[IPOPT_OPTVAL];
3107                 if (opt == IPOPT_EOL) {
3108                         break;
3109                 }
3110                 if (opt == IPOPT_NOP) {
3111                         optlen = 1;
3112                 } else {
3113 #ifdef DIAGNOSTIC
3114                         if (cnt < IPOPT_OLEN + sizeof(*cp)) {
3115                                 break;
3116                         }
3117 #endif
3118                         optlen = cp[IPOPT_OLEN];
3119 #ifdef DIAGNOSTIC
3120                         if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) {
3121                                 break;
3122                         }
3123 #endif
3124                 }
3125                 switch (opt) {
3126                 case IPOPT_RA:
3127 #ifdef DIAGNOSTIC
3128                         if (optlen != IPOPT_OFFSET + sizeof(uint16_t) ||
3129                             (*((uint16_t *)(void *)&cp[IPOPT_OFFSET]) != 0)) {
3130                                 break;
3131                         } else
3132 #endif
3133                         found_ra = 1;
3134                         break;
3135                 default:
3136                         break;
3137                 }
3138         }
3139
3140         return found_ra;
3141 }
3142
3143 /*
3144  * Given address of next destination (final or next hop),
3145  * return internet address info of interface to be used to get there.
3146  */
3147 struct in_ifaddr *
3148 ip_rtaddr(struct in_addr dst)
3149 {
3150         struct sockaddr_in *sin;
3151         struct ifaddr *rt_ifa;
3152         struct route ro;
3153
3154         bzero(&ro, sizeof(ro));
3155         sin = SIN(&ro.ro_dst);
3156         sin->sin_family = AF_INET;
3157         sin->sin_len = sizeof(*sin);
3158         sin->sin_addr = dst;
3159
3160         rtalloc_ign(&ro, RTF_PRCLONING);
3161         if (ro.ro_rt == NULL) {
3162                 ROUTE_RELEASE(&ro);
3163                 return NULL;
3164         }
3165
3166         RT_LOCK(ro.ro_rt);
3167         if ((rt_ifa = ro.ro_rt->rt_ifa) != NULL) {
3168                 IFA_ADDREF(rt_ifa);
3169         }
3170         RT_UNLOCK(ro.ro_rt);
3171         ROUTE_RELEASE(&ro);
3172
3173         return (struct in_ifaddr *)rt_ifa;
3174 }
3175
3176 /*
3177  * Save incoming source route for use in replies,
3178  * to be picked up later by ip_srcroute if the receiver is interested.
3179  */
3180 void
3181 save_rte(u_char *option, struct in_addr dst)
3182 {
3183         unsigned olen;
3184
3185         olen = option[IPOPT_OLEN];
3186 #if DIAGNOSTIC
3187         if (ipprintfs) {
3188                 printf("save_rte: olen %d\n", olen);
3189         }
3190 #endif
3191         if (olen > sizeof(ip_srcrt) - (1 + sizeof(dst))) {
3192                 return;
3193         }
3194         bcopy(option, ip_srcrt.srcopt, olen);
3195         ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr);
3196         ip_srcrt.dst = dst;
3197 }
3198
3199 /*
3200  * Retrieve incoming source route for use in replies,
3201  * in the same form used by setsockopt.
3202  * The first hop is placed before the options, will be removed later.
3203  */
3204 struct mbuf *
3205 ip_srcroute(void)
3206 {
3207         struct in_addr *p, *q;
3208         struct mbuf *m;
3209
3210         if (ip_nhops == 0) {
3211                 return NULL;
3212         }
3213
3214         m = m_get(M_DONTWAIT, MT_HEADER);
3215         if (m == NULL) {
3216                 return NULL;
3217         }
3218
3219 #define OPTSIZ  (sizeof (ip_srcrt.nop) + sizeof (ip_srcrt.srcopt))
3220
3221         /* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */
3222         m->m_len = ip_nhops * sizeof(struct in_addr) +
3223             sizeof(struct in_addr) + OPTSIZ;
3224 #if DIAGNOSTIC
3225         if (ipprintfs) {
3226                 printf("ip_srcroute: nhops %d mlen %d", ip_nhops, m->m_len);
3227         }
3228 #endif
3229
3230         /*
3231          * First save first hop for return route
3232          */
3233         p = &ip_srcrt.route[ip_nhops - 1];
3234         *(mtod(m, struct in_addr *)) = *p--;
3235 #if DIAGNOSTIC
3236         if (ipprintfs) {
3237                 printf(" hops %lx",
3238                     (u_int32_t)ntohl(mtod(m, struct in_addr *)->s_addr));
3239         }
3240 #endif
3241
3242         /*
3243          * Copy option fields and padding (nop) to mbuf.
3244          */
3245         ip_srcrt.nop = IPOPT_NOP;
3246         ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF;
3247         (void) memcpy(mtod(m, caddr_t) + sizeof(struct in_addr),
3248             &ip_srcrt.nop, OPTSIZ);
3249         q = (struct in_addr *)(void *)(mtod(m, caddr_t) +
3250             sizeof(struct in_addr) + OPTSIZ);
3251 #undef OPTSIZ
3252         /*
3253          * Record return path as an IP source route,
3254          * reversing the path (pointers are now aligned).
3255          */
3256         while (p >= ip_srcrt.route) {
3257 #if DIAGNOSTIC
3258                 if (ipprintfs) {
3259                         printf(" %lx", (u_int32_t)ntohl(q->s_addr));
3260                 }
3261 #endif
3262                 *q++ = *p--;
3263         }
3264         /*
3265          * Last hop goes to final destination.
3266          */
3267         *q = ip_srcrt.dst;
3268 #if DIAGNOSTIC
3269         if (ipprintfs) {
3270                 printf(" %lx\n", (u_int32_t)ntohl(q->s_addr));
3271         }
3272 #endif
3273         return m;
3274 }
3275
3276 /*
3277  * Strip out IP options, at higher level protocol in the kernel.
3278  */
3279 void
3280 ip_stripoptions(struct mbuf *m)
3281 {
3282         int i;
3283         struct ip *ip = mtod(m, struct ip *);
3284         caddr_t opts;
3285         int olen;
3286
3287         /* Expect 32-bit aligned data pointer on strict-align platforms */
3288         MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
3289
3290         /* use bcopy() since it supports overlapping range */
3291         olen = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof(struct ip);
3292         opts = (caddr_t)(ip + 1);
3293         i = m->m_len - (sizeof(struct ip) + olen);
3294         bcopy(opts + olen, opts, (unsigned)i);
3295         m->m_len -= olen;
3296         if (m->m_flags & M_PKTHDR) {
3297                 m->m_pkthdr.len -= olen;
3298         }
3299         ip->ip_vhl = IP_MAKE_VHL(IPVERSION, sizeof(struct ip) >> 2);
3300
3301         /*
3302          * We expect ip_{off,len} to be in host order by now, and
3303          * that the original IP header length has been subtracted
3304          * out from ip_len.  Temporarily adjust ip_len for checksum
3305          * recalculation, and restore it afterwards.
3306          */
3307         ip->ip_len += sizeof(struct ip);
3308
3309         /* recompute checksum now that IP header is smaller */
3310 #if BYTE_ORDER != BIG_ENDIAN
3311         HTONS(ip->ip_len);
3312         HTONS(ip->ip_off);
3313 #endif /* BYTE_ORDER != BIG_ENDIAN */
3314         ip->ip_sum = in_cksum_hdr(ip);
3315 #if BYTE_ORDER != BIG_ENDIAN
3316         NTOHS(ip->ip_off);
3317         NTOHS(ip->ip_len);
3318 #endif /* BYTE_ORDER != BIG_ENDIAN */
3319
3320         ip->ip_len -= sizeof(struct ip);
3321
3322         /*
3323          * Given that we've just stripped IP options from the header,
3324          * we need to adjust the start offset accordingly if this
3325          * packet had gone thru partial checksum offload.
3326          */
3327         if ((m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
3328             (CSUM_DATA_VALID | CSUM_PARTIAL)) {
3329                 if (m->m_pkthdr.csum_rx_start >= (sizeof(struct ip) + olen)) {
3330                         /* most common case */
3331                         m->m_pkthdr.csum_rx_start -= olen;
3332                 } else {
3333                         /* compute checksum in software instead */
3334                         m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
3335                         m->m_pkthdr.csum_data = 0;
3336                         ipstat.ips_adj_hwcsum_clr++;
3337                 }
3338         }
3339 }
3340
/*
 * Table of PRC_NCMDS errno values; entries not listed explicitly at the
 * end of the initializer are zero.
 * NOTE(review): presumably indexed by PRC_* control command code, with 0
 * meaning "no error to report" -- confirm against the PRC_* definitions
 * and the users of this table.
 */
u_char inetctlerrmap[PRC_NCMDS] = {
        0, 0, 0, 0,
        0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH,
        ENETUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED,
        EMSGSIZE, EHOSTUNREACH, 0, 0,
        0, 0, EHOSTUNREACH, 0,
        ENOPROTOOPT, ECONNREFUSED
};
3349
3350 static int
3351 sysctl_ipforwarding SYSCTL_HANDLER_ARGS
3352 {
3353 #pragma unused(arg1, arg2)
3354         int i, was_ipforwarding = ipforwarding;
3355
3356         i = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
3357         if (i != 0 || req->newptr == USER_ADDR_NULL) {
3358                 return i;
3359         }
3360
3361         if (was_ipforwarding && !ipforwarding) {
3362                 /* clean up IPv4 forwarding cached routes */
3363                 ifnet_head_lock_shared();
3364                 for (i = 0; i <= if_index; i++) {
3365                         struct ifnet *ifp = ifindex2ifnet[i];
3366                         if (ifp != NULL) {
3367                                 lck_mtx_lock(&ifp->if_cached_route_lock);
3368                                 ROUTE_RELEASE(&ifp->if_fwd_route);
3369                                 bzero(&ifp->if_fwd_route,
3370                                     sizeof(ifp->if_fwd_route));
3371                                 lck_mtx_unlock(&ifp->if_cached_route_lock);
3372                         }
3373                 }
3374                 ifnet_head_done();
3375         }
3376
3377         return 0;
3378 }
3379
3380 /*
3381  * Similar to inp_route_{copyout,copyin} routines except that these copy
3382  * out the cached IPv4 forwarding route from struct ifnet instead of the
3383  * inpcb.  See comments for those routines for explanations.
3384  */
3385 static void
3386 ip_fwd_route_copyout(struct ifnet *ifp, struct route *dst)
3387 {
3388         struct route *src = &ifp->if_fwd_route;
3389
3390         lck_mtx_lock_spin(&ifp->if_cached_route_lock);
3391         lck_mtx_convert_spin(&ifp->if_cached_route_lock);
3392
3393         /* Minor sanity check */
3394         if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET) {
3395                 panic("%s: wrong or corrupted route: %p", __func__, src);
3396         }
3397
3398         route_copyout(dst, src, sizeof(*dst));
3399
3400         lck_mtx_unlock(&ifp->if_cached_route_lock);
3401 }
3402
3403 static void
3404 ip_fwd_route_copyin(struct ifnet *ifp, struct route *src)
3405 {
3406         struct route *dst = &ifp->if_fwd_route;
3407
3408         lck_mtx_lock_spin(&ifp->if_cached_route_lock);
3409         lck_mtx_convert_spin(&ifp->if_cached_route_lock);
3410
3411         /* Minor sanity check */
3412         if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET) {
3413                 panic("%s: wrong or corrupted route: %p", __func__, src);
3414         }
3415
3416         if (ifp->if_fwd_cacheok) {
3417                 route_copyin(src, dst, sizeof(*src));
3418         }
3419
3420         lck_mtx_unlock(&ifp->if_cached_route_lock);
3421 }
3422
/*
 * Forward a packet.  If some error occurs return the sender
 * an icmp packet.  Note we can't always generate a meaningful
 * icmp message because icmp doesn't have a large enough repertoire
 * of codes and types.
 *
 * If not forwarding, just drop the packet.  This could be confusing
 * if ipforwarding was zero but some routing protocol was advertising
 * us as a gateway to somewhere.  However, we must let the routing
 * protocol deal with that.
 *
 * The srcrt parameter indicates whether the packet is being forwarded
 * via a source route; when it is non-zero no redirect is generated.
 *
 * On every path through this function m is handed to m_freem(),
 * icmp_error() or ip_output().  The next_hop parameter is unused.
 *
 * The route used is taken from, and on the paths that reach the `done'
 * label given back to, the forwarding route cache of the interface the
 * packet was received on.
 */
static void
ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop)
{
#pragma unused(next_hop)
        struct ip *ip = mtod(m, struct ip *);
        struct sockaddr_in *sin;
        struct rtentry *rt;
        struct route fwd_rt;
        int error, type = 0, code = 0;
        struct mbuf *mcopy;
        n_long dest;
        struct in_addr pkt_dst;
        u_int32_t nextmtu = 0, len;
        struct ip_out_args ipoa;
        struct ifnet *rcvifp = m->m_pkthdr.rcvif;

        /* output arguments: unscoped, no traffic class or service type */
        bzero(&ipoa, sizeof(ipoa));
        ipoa.ipoa_boundif = IFSCOPE_NONE;
        ipoa.ipoa_sotc = SO_TC_UNSPEC;
        ipoa.ipoa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;

#if IPSEC
        struct secpolicy *sp = NULL;
        int ipsecerror;
#endif /* IPSEC */
#if PF
        struct pf_mtag *pf_mtag;
#endif /* PF */

        dest = 0;
        pkt_dst = ip->ip_dst;

#if DIAGNOSTIC
        if (ipprintfs) {
                printf("forward: src %lx dst %lx ttl %x\n",
                    (u_int32_t)ip->ip_src.s_addr, (u_int32_t)pkt_dst.s_addr,
                    ip->ip_ttl);
        }
#endif

        /* link-level broadcast/multicast or unforwardable destination: drop */
        if (m->m_flags & (M_BCAST | M_MCAST) || !in_canforward(pkt_dst)) {
                OSAddAtomic(1, &ipstat.ips_cantforward);
                m_freem(m);
                return;
        }
        /* the TTL check is skipped when IPSTEALTH is built in and enabled */
#if IPSTEALTH
        if (!ipstealth) {
#endif /* IPSTEALTH */
        if (ip->ip_ttl <= IPTTLDEC) {
                icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS,
                    dest, 0);
                return;
        }
#if IPSTEALTH
}
#endif /* IPSTEALTH */

#if PF
        /* a routing table id tagged on the packet by PF scopes the lookup */
        pf_mtag = pf_find_mtag(m);
        if (pf_mtag != NULL && pf_mtag->pftag_rtableid != IFSCOPE_NONE) {
                ipoa.ipoa_boundif = pf_mtag->pftag_rtableid;
                ipoa.ipoa_flags |= IPOAF_BOUND_IF;
        }
#endif /* PF */

        /* start from the forwarding route cached on the receive interface */
        ip_fwd_route_copyout(rcvifp, &fwd_rt);

        /*
         * Look the route up afresh when the cached one is unusable or
         * was for a different destination.
         */
        sin = SIN(&fwd_rt.ro_dst);
        if (ROUTE_UNUSABLE(&fwd_rt) || pkt_dst.s_addr != sin->sin_addr.s_addr) {
                ROUTE_RELEASE(&fwd_rt);

                sin->sin_family = AF_INET;
                sin->sin_len = sizeof(*sin);
                sin->sin_addr = pkt_dst;

                rtalloc_scoped_ign(&fwd_rt, RTF_PRCLONING, ipoa.ipoa_boundif);
                if (fwd_rt.ro_rt == NULL) {
                        icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0);
                        goto done;
                }
        }
        rt = fwd_rt.ro_rt;

        /*
         * Save the IP header and at most 8 bytes of the payload,
         * in case we need to generate an ICMP message to the src.
         *
         * We don't use m_copy() because it might return a reference
         * to a shared cluster. Both this function and ip_output()
         * assume exclusive access to the IP header in `m', so any
         * data in a cluster may change before we reach icmp_error().
         *
         * A failed allocation is tolerated: mcopy stays NULL and no
         * ICMP message is generated further down.
         */
        MGET(mcopy, M_DONTWAIT, m->m_type);
        if (mcopy != NULL) {
                M_COPY_PKTHDR(mcopy, m);
                mcopy->m_len = imin((IP_VHL_HL(ip->ip_vhl) << 2) + 8,
                    (int)ip->ip_len);
                m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t));
        }

        /* the TTL is left untouched when IPSTEALTH is built in and enabled */
#if IPSTEALTH
        if (!ipstealth) {
#endif /* IPSTEALTH */
        ip->ip_ttl -= IPTTLDEC;
#if IPSTEALTH
}
#endif /* IPSTEALTH */

        /*
         * If forwarding packet using same interface that it came in on,
         * perhaps should send a redirect to sender to shortcut a hop.
         * Only send redirect if source is sending directly to us,
         * and if packet was not source routed (or has any options).
         * Also, don't send redirect if forwarding using a default route
         * or a route modified by a redirect.
         */
        RT_LOCK_SPIN(rt);
        if (rt->rt_ifp == m->m_pkthdr.rcvif &&
            !(rt->rt_flags & (RTF_DYNAMIC | RTF_MODIFIED)) &&
            satosin(rt_key(rt))->sin_addr.s_addr != INADDR_ANY &&
            ipsendredirects && !srcrt && rt->rt_ifa != NULL) {
                struct in_ifaddr *ia = (struct in_ifaddr *)rt->rt_ifa;
                u_int32_t src = ntohl(ip->ip_src.s_addr);

                /* Become a regular mutex */
                RT_CONVERT_LOCK(rt);
                IFA_LOCK_SPIN(&ia->ia_ifa);
                /* source is on the subnet of the outgoing interface address */
                if ((src & ia->ia_subnetmask) == ia->ia_subnet) {
                        /* redirect target: the gateway, else the destination */
                        if (rt->rt_flags & RTF_GATEWAY) {
                                dest = satosin(rt->rt_gateway)->sin_addr.s_addr;
                        } else {
                                dest = pkt_dst.s_addr;
                        }
                        /*
                         * Router requirements says to only send
                         * host redirects.
                         */
                        type = ICMP_REDIRECT;
                        code = ICMP_REDIRECT_HOST;
#if DIAGNOSTIC
                        if (ipprintfs) {
                                printf("redirect (%d) to %lx\n", code,
                                    (u_int32_t)dest);
                        }
#endif
                }
                IFA_UNLOCK(&ia->ia_ifa);
        }
        RT_UNLOCK(rt);


        /* Mark this packet as being forwarded from another interface */
        m->m_pkthdr.pkt_flags |= PKTF_FORWARDED;
        /* read the length before ip_output(); it feeds the stats below */
        len = m_pktlen(m);

        error = ip_output(m, NULL, &fwd_rt, IP_FORWARDING | IP_OUTARGS,
            NULL, &ipoa);

        /* Refresh rt since the route could have changed while in IP */
        rt = fwd_rt.ro_rt;

        if (error != 0) {
                OSAddAtomic(1, &ipstat.ips_cantforward);
        } else {
                /*
                 * Increment stats on the source interface; the ones
                 * for destination interface has been taken care of
                 * during output above by virtue of PKTF_FORWARDED.
                 */
                rcvifp->if_fpackets++;
                rcvifp->if_fbytes += len;

                OSAddAtomic(1, &ipstat.ips_forward);
                if (type != 0) {
                        /* a redirect was set up above; it is sent below */
                        OSAddAtomic(1, &ipstat.ips_redirectsent);
                } else {
                        if (mcopy != NULL) {
                                /*
                                 * If we didn't have to go thru ipflow and
                                 * the packet was successfully consumed by
                                 * ip_output, the mcopy is rather a waste;
                                 * this could be further optimized.
                                 */
                                m_freem(mcopy);
                        }
                        goto done;
                }
        }
        /* without the saved copy there is nothing to build an ICMP from */
        if (mcopy == NULL) {
                goto done;
        }

        /* translate the ip_output() result into an ICMP type and code */
        switch (error) {
        case 0:                         /* forwarded, but need redirect */
                /* type, code set above */
                break;

        case ENETUNREACH:               /* shouldn't happen, checked above */
        case EHOSTUNREACH:
        case ENETDOWN:
        case EHOSTDOWN:
        default:
                type = ICMP_UNREACH;
                code = ICMP_UNREACH_HOST;
                break;

        case EMSGSIZE:
                type = ICMP_UNREACH;
                code = ICMP_UNREACH_NEEDFRAG;

                /* report the MTU of the outgoing interface when known */
                if (rt == NULL) {
                        break;
                } else {
                        RT_LOCK_SPIN(rt);
                        if (rt->rt_ifp != NULL) {
                                nextmtu = rt->rt_ifp->if_mtu;
                        }
                        RT_UNLOCK(rt);
                }
#ifdef IPSEC
                if (ipsec_bypass) {
                        break;
                }

                /*
                 * If the packet is routed over IPsec tunnel, tell the
                 * originator the tunnel MTU.
                 *      tunnel MTU = if MTU - sizeof(IP) - ESP/AH hdrsiz
                 * XXX quickhack!!!
                 */
                sp = ipsec4_getpolicybyaddr(mcopy, IPSEC_DIR_OUTBOUND,
                    IP_FORWARDING, &ipsecerror);

                if (sp == NULL) {
                        break;
                }

                /*
                 * find the correct route for outer IPv4
                 * header, compute tunnel MTU.
                 *
                 * From here on the interface MTU read above is dropped:
                 * nextmtu stays 0 unless a tunnel route supplies one.
                 */
                nextmtu = 0;

                if (sp->req != NULL &&
                    sp->req->saidx.mode == IPSEC_MODE_TUNNEL) {
                        struct secasindex saidx;
                        struct secasvar *sav;
                        struct route *ro;
                        struct ip *ipm;
                        int ipsechdr;

                        /* count IPsec header size */
                        ipsechdr = ipsec_hdrsiz(sp);

                        /*
                         * Build the SA index from the policy request;
                         * endpoints the request leaves empty (sin_len
                         * of 0) are taken from the saved packet header.
                         */
                        ipm = mtod(mcopy, struct ip *);
                        bcopy(&sp->req->saidx, &saidx, sizeof(saidx));
                        saidx.mode = sp->req->saidx.mode;
                        saidx.reqid = sp->req->saidx.reqid;
                        sin = SIN(&saidx.src);
                        if (sin->sin_len == 0) {
                                sin->sin_len = sizeof(*sin);
                                sin->sin_family = AF_INET;
                                sin->sin_port = IPSEC_PORT_ANY;
                                bcopy(&ipm->ip_src, &sin->sin_addr,
                                    sizeof(sin->sin_addr));
                        }
                        sin = SIN(&saidx.dst);
                        if (sin->sin_len == 0) {
                                sin->sin_len = sizeof(*sin);
                                sin->sin_family = AF_INET;
                                sin->sin_port = IPSEC_PORT_ANY;
                                bcopy(&ipm->ip_dst, &sin->sin_addr,
                                    sizeof(sin->sin_addr));
                        }
                        sav = key_allocsa_policy(&saidx);
                        if (sav != NULL) {
                                /* the SA's cached route is read under sadb_mutex */
                                lck_mtx_lock(sadb_mutex);
                                if (sav->sah != NULL) {
                                        ro = (struct route *)&sav->sah->sa_route;
                                        if (ro->ro_rt != NULL) {
                                                RT_LOCK(ro->ro_rt);
                                                if (ro->ro_rt->rt_ifp != NULL) {
                                                        nextmtu = ro->ro_rt->
                                                            rt_ifp->if_mtu;
                                                        nextmtu -= ipsechdr;
                                                }
                                                RT_UNLOCK(ro->ro_rt);
                                        }
                                }
                                key_freesav(sav, KEY_SADB_LOCKED);
                                lck_mtx_unlock(sadb_mutex);
                        }
                }
                key_freesp(sp, KEY_SADB_UNLOCKED);
#endif /* IPSEC */
                break;

        case ENOBUFS:
                /*
                 * A router should not generate ICMP_SOURCEQUENCH as
                 * required in RFC1812 Requirements for IP Version 4 Routers.
                 * Source quench could be a big problem under DoS attacks,
                 * or if the underlying interface is rate-limited.
                 * Those who need source quench packets may re-enable them
                 * via the net.inet.ip.sendsourcequench sysctl.
                 */
                if (ip_sendsourcequench == 0) {
                        m_freem(mcopy);
                        goto done;
                } else {
                        type = ICMP_SOURCEQUENCH;
                        code = 0;
                }
                break;

        case EACCES:
                /* no ICMP message is generated for this error */
                m_freem(mcopy);
                goto done;
        }

        if (type == ICMP_UNREACH && code == ICMP_UNREACH_NEEDFRAG) {
                OSAddAtomic(1, &ipstat.ips_cantfrag);
        }

        icmp_error(mcopy, type, code, dest, nextmtu);
done:
        /* give the route back to the receive interface's forwarding cache */
        ip_fwd_route_copyin(rcvifp, &fwd_rt);
}
3765
3766 int
3767 ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip,
3768     struct mbuf *m)
3769 {
3770         *mp = NULL;
3771         if (inp->inp_socket->so_options & SO_TIMESTAMP) {
3772                 struct timeval tv;
3773
3774                 getmicrotime(&tv);
3775                 mp = sbcreatecontrol_mbuf((caddr_t)&tv, sizeof(tv),
3776                     SCM_TIMESTAMP, SOL_SOCKET, mp);
3777                 if (*mp == NULL) {
3778                         goto no_mbufs;
3779                 }
3780         }
3781         if (inp->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) {
3782                 uint64_t time;
3783
3784                 time = mach_absolute_time();
3785                 mp = sbcreatecontrol_mbuf((caddr_t)&time, sizeof(time),
3786                     SCM_TIMESTAMP_MONOTONIC, SOL_SOCKET, mp);
3787                 if (*mp == NULL) {
3788                         goto no_mbufs;
3789                 }
3790         }
3791         if (inp->inp_socket->so_options & SO_TIMESTAMP_CONTINUOUS) {
3792                 uint64_t time;
3793
3794                 time = mach_continuous_time();
3795                 mp = sbcreatecontrol_mbuf((caddr_t)&time, sizeof(time),
3796                     SCM_TIMESTAMP_CONTINUOUS, SOL_SOCKET, mp);
3797                 if (*mp == NULL) {
3798                         goto no_mbufs;
3799                 }
3800         }
3801         if (inp->inp_flags & INP_RECVDSTADDR
3802 #if CONTENT_FILTER
3803             /* Content Filter needs to see local address */
3804             || (inp->inp_socket->so_cfil_db != NULL)
3805 #endif
3806             ) {
3807                 mp = sbcreatecontrol_mbuf((caddr_t)&ip->ip_dst,
3808                     sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP, mp);
3809                 if (*mp == NULL) {
3810                         goto no_mbufs;
3811                 }
3812         }
3813 #ifdef notyet
3814         /*
3815          * XXX
3816          * Moving these out of udp_input() made them even more broken
3817          * than they already were.
3818          */
3819         /* options were tossed already */
3820         if (inp->inp_flags & INP_RECVOPTS) {
3821                 mp = sbcreatecontrol_mbuf((caddr_t)opts_deleted_above,
3822                     sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP, mp);
3823                 if (*mp == NULL) {
3824                         goto no_mbufs;
3825                 }
3826         }
3827         /* ip_srcroute doesn't do what we want here, need to fix */
3828         if (inp->inp_flags & INP_RECVRETOPTS) {
3829                 mp = sbcreatecontrol_mbuf((caddr_t)ip_srcroute(),
3830                     sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP, mp);
3831                 if (*mp == NULL) {
3832                         goto no_mbufs;
3833                 }
3834         }
3835 #endif /* notyet */
3836         if (inp->inp_flags & INP_RECVIF) {
3837                 struct ifnet *ifp;
3838                 uint8_t sdlbuf[SOCK_MAXADDRLEN + 1];
3839                 struct sockaddr_dl *sdl2 = SDL(&sdlbuf);
3840
3841                 /*
3842                  * Make sure to accommodate the largest possible
3843                  * size of SA(if_lladdr)->sa_len.
3844                  */
3845                 _CASSERT(sizeof(sdlbuf) == (SOCK_MAXADDRLEN + 1));
3846
3847                 ifnet_head_lock_shared();
3848                 if ((ifp = m->m_pkthdr.rcvif) != NULL &&
3849                     ifp->if_index && (ifp->if_index <= if_index)) {
3850                         struct ifaddr *ifa = ifnet_addrs[ifp->if_index - 1];
3851                         struct sockaddr_dl *sdp;
3852
3853                         if (!ifa || !ifa->ifa_addr) {
3854                                 goto makedummy;
3855                         }
3856
3857                         IFA_LOCK_SPIN(ifa);
3858                         sdp = SDL(ifa->ifa_addr);
3859                         /*
3860                          * Change our mind and don't try copy.
3861                          */
3862                         if (sdp->sdl_family != AF_LINK) {
3863                                 IFA_UNLOCK(ifa);
3864                                 goto makedummy;
3865                         }
3866                         /* the above _CASSERT ensures sdl_len fits in sdlbuf */
3867                         bcopy(sdp, sdl2, sdp->sdl_len);
3868                         IFA_UNLOCK(ifa);
3869                 } else {
3870 makedummy:
3871                         sdl2->sdl_len =
3872                             offsetof(struct sockaddr_dl, sdl_data[0]);
3873                         sdl2->sdl_family = AF_LINK;
3874                         sdl2->sdl_index = 0;
3875                         sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0;
3876                 }
3877                 ifnet_head_done();
3878                 mp = sbcreatecontrol_mbuf((caddr_t)sdl2, sdl2->sdl_len,
3879                     IP_RECVIF, IPPROTO_IP, mp);
3880                 if (*mp == NULL) {
3881                         goto no_mbufs;
3882                 }
3883         }
3884         if (inp->inp_flags & INP_RECVTTL) {
3885                 mp = sbcreatecontrol_mbuf((caddr_t)&ip->ip_ttl,
3886                     sizeof(ip->ip_ttl), IP_RECVTTL, IPPROTO_IP, mp);
3887                 if (*mp == NULL) {
3888                         goto no_mbufs;
3889                 }
3890         }
3891         if (inp->inp_socket->so_flags & SOF_RECV_TRAFFIC_CLASS) {
3892                 int tc = m_get_traffic_class(m);
3893
3894                 mp = sbcreatecontrol_mbuf((caddr_t)&tc, sizeof(tc),
3895                     SO_TRAFFIC_CLASS, SOL_SOCKET, mp);
3896                 if (*mp == NULL) {
3897                         goto no_mbufs;
3898                 }
3899         }
3900         if (inp->inp_flags & INP_PKTINFO) {
3901                 struct in_pktinfo pi;
3902
3903                 bzero(&pi, sizeof(struct in_pktinfo));
3904                 bcopy(&ip->ip_dst, &pi.ipi_addr, sizeof(struct in_addr));
3905                 pi.ipi_ifindex = (m != NULL && m->m_pkthdr.rcvif != NULL) ?
3906                     m->m_pkthdr.rcvif->if_index : 0;
3907
3908                 mp = sbcreatecontrol_mbuf((caddr_t)&pi,
3909                     sizeof(struct in_pktinfo), IP_RECVPKTINFO, IPPROTO_IP, mp);
3910                 if (*mp == NULL) {
3911                         goto no_mbufs;
3912                 }
3913         }
3914         if (inp->inp_flags & INP_RECVTOS) {
3915                 mp = sbcreatecontrol_mbuf((caddr_t)&ip->ip_tos,
3916                     sizeof(u_char), IP_RECVTOS, IPPROTO_IP, mp);
3917                 if (*mp == NULL) {
3918                         goto no_mbufs;
3919                 }
3920         }
3921         return 0;
3922
3923 no_mbufs:
3924         ipstat.ips_pktdropcntrl++;
3925         return ENOBUFS;
3926 }
3927
3928 static inline u_short
3929 ip_cksum(struct mbuf *m, int hlen)
3930 {
3931         u_short sum;
3932
3933         if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) {
3934                 sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID);
3935         } else if (!(m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) &&
3936             !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
3937                 /*
3938                  * The packet arrived on an interface which isn't capable
3939                  * of performing IP header checksum; compute it now.
3940                  */
3941                 sum = ip_cksum_hdr_in(m, hlen);
3942         } else {
3943                 sum = 0;
3944                 m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
3945                     CSUM_IP_CHECKED | CSUM_IP_VALID);
3946                 m->m_pkthdr.csum_data = 0xffff;
3947         }
3948
3949         if (sum != 0) {
3950                 OSAddAtomic(1, &ipstat.ips_badsum);
3951         }
3952
3953         return sum;
3954 }
3955
3956 static int
3957 ip_getstat SYSCTL_HANDLER_ARGS
3958 {
3959 #pragma unused(oidp, arg1, arg2)
3960         if (req->oldptr == USER_ADDR_NULL) {
3961                 req->oldlen = (size_t)sizeof(struct ipstat);
3962         }
3963
3964         return SYSCTL_OUT(req, &ipstat, MIN(sizeof(ipstat), req->oldlen));
3965 }
3966
3967 void
3968 ip_setsrcifaddr_info(struct mbuf *m, uint32_t src_idx, struct in_ifaddr *ia)
3969 {
3970         VERIFY(m->m_flags & M_PKTHDR);
3971
3972         /*
3973          * If the source ifaddr is specified, pick up the information
3974          * from there; otherwise just grab the passed-in ifindex as the
3975          * caller may not have the ifaddr available.
3976          */
3977         if (ia != NULL) {
3978                 m->m_pkthdr.pkt_flags |= PKTF_IFAINFO;
3979                 m->m_pkthdr.src_ifindex = ia->ia_ifp->if_index;
3980         } else {
3981                 m->m_pkthdr.src_ifindex = src_idx;
3982                 if (src_idx != 0) {
3983                         m->m_pkthdr.pkt_flags |= PKTF_IFAINFO;
3984                 }
3985         }
3986 }
3987
3988 void
3989 ip_setdstifaddr_info(struct mbuf *m, uint32_t dst_idx, struct in_ifaddr *ia)
3990 {
3991         VERIFY(m->m_flags & M_PKTHDR);
3992
3993         /*
3994          * If the destination ifaddr is specified, pick up the information
3995          * from there; otherwise just grab the passed-in ifindex as the
3996          * caller may not have the ifaddr available.
3997          */
3998         if (ia != NULL) {
3999                 m->m_pkthdr.pkt_flags |= PKTF_IFAINFO;
4000                 m->m_pkthdr.dst_ifindex = ia->ia_ifp->if_index;
4001         } else {
4002                 m->m_pkthdr.dst_ifindex = dst_idx;
4003                 if (dst_idx != 0) {
4004                         m->m_pkthdr.pkt_flags |= PKTF_IFAINFO;
4005                 }
4006         }
4007 }
4008
4009 int
4010 ip_getsrcifaddr_info(struct mbuf *m, uint32_t *src_idx, uint32_t *iaf)
4011 {
4012         VERIFY(m->m_flags & M_PKTHDR);
4013
4014         if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO)) {
4015                 return -1;
4016         }
4017
4018         if (src_idx != NULL) {
4019                 *src_idx = m->m_pkthdr.src_ifindex;
4020         }
4021
4022         if (iaf != NULL) {
4023                 *iaf = 0;
4024         }
4025
4026         return 0;
4027 }
4028
4029 int
4030 ip_getdstifaddr_info(struct mbuf *m, uint32_t *dst_idx, uint32_t *iaf)
4031 {
4032         VERIFY(m->m_flags & M_PKTHDR);
4033
4034         if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO)) {
4035                 return -1;
4036         }
4037
4038         if (dst_idx != NULL) {
4039                 *dst_idx = m->m_pkthdr.dst_ifindex;
4040         }
4041
4042         if (iaf != NULL) {
4043                 *iaf = 0;
4044         }
4045
4046         return 0;
4047 }
4048
4049 /*
4050  * Protocol input handler for IPPROTO_GRE.
4051  */
4052 void
4053 gre_input(struct mbuf *m, int off)
4054 {
4055         gre_input_func_t fn = gre_input_func;
4056
4057         /*
4058          * If there is a registered GRE input handler, pass mbuf to it.
4059          */
4060         if (fn != NULL) {
4061                 lck_mtx_unlock(inet_domain_mutex);
4062                 m = fn(m, off, (mtod(m, struct ip *))->ip_p);
4063                 lck_mtx_lock(inet_domain_mutex);
4064         }
4065
4066         /*
4067          * If no matching tunnel that is up is found, we inject
4068          * the mbuf to raw ip socket to see if anyone picks it up.
4069          */
4070         if (m != NULL) {
4071                 rip_input(m, off);
4072         }
4073 }
4074
4075 /*
4076  * Private KPI for PPP/PPTP.
4077  */
4078 int
4079 ip_gre_register_input(gre_input_func_t fn)
4080 {
4081         lck_mtx_lock(inet_domain_mutex);
4082         gre_input_func = fn;
4083         lck_mtx_unlock(inet_domain_mutex);
4084
4085         return 0;
4086 }
4087
4088 #if (DEBUG || DEVELOPMENT)
4089 static int
4090 sysctl_reset_ip_input_stats SYSCTL_HANDLER_ARGS
4091 {
4092 #pragma unused(arg1, arg2)
4093         int error, i;
4094
4095         i = ip_input_measure;
4096         error = sysctl_handle_int(oidp, &i, 0, req);
4097         if (error || req->newptr == USER_ADDR_NULL) {
4098                 goto done;
4099         }
4100         /* impose bounds */
4101         if (i < 0 || i > 1) {
4102                 error = EINVAL;
4103                 goto done;
4104         }
4105         if (ip_input_measure != i && i == 1) {
4106                 net_perf_initialize(&net_perf, ip_input_measure_bins);
4107         }
4108         ip_input_measure = i;
4109 done:
4110         return error;
4111 }
4112
4113 static int
4114 sysctl_ip_input_measure_bins SYSCTL_HANDLER_ARGS
4115 {
4116 #pragma unused(arg1, arg2)
4117         int error;
4118         uint64_t i;
4119
4120         i = ip_input_measure_bins;
4121         error = sysctl_handle_quad(oidp, &i, 0, req);
4122         if (error || req->newptr == USER_ADDR_NULL) {
4123                 goto done;
4124         }
4125         /* validate data */
4126         if (!net_perf_validate_bins(i)) {
4127                 error = EINVAL;
4128                 goto done;
4129         }
4130         ip_input_measure_bins = i;
4131 done:
4132         return error;
4133 }
4134
4135 static int
4136 sysctl_ip_input_getperf SYSCTL_HANDLER_ARGS
4137 {
4138 #pragma unused(oidp, arg1, arg2)
4139         if (req->oldptr == USER_ADDR_NULL) {
4140                 req->oldlen = (size_t)sizeof(struct ipstat);
4141         }
4142
4143         return SYSCTL_OUT(req, &net_perf, MIN(sizeof(net_perf), req->oldlen));
4144 }
4145 #endif /* (DEBUG || DEVELOPMENT) */
4146
4147 static int
4148 sysctl_ip_checkinterface SYSCTL_HANDLER_ARGS
4149 {
4150 #pragma unused(arg1, arg2)
4151         int error, i;
4152
4153         i = ip_checkinterface;
4154         error = sysctl_handle_int(oidp, &i, 0, req);
4155         if (error != 0 || req->newptr == USER_ADDR_NULL) {
4156                 return error;
4157         }
4158
4159         switch (i) {
4160         case IP_CHECKINTERFACE_WEAK_ES:
4161         case IP_CHECKINTERFACE_HYBRID_ES:
4162         case IP_CHECKINTERFACE_STRONG_ES:
4163                 if (ip_checkinterface != i) {
4164                         ip_checkinterface = i;
4165                         os_log(OS_LOG_DEFAULT, "%s: ip_checkinterface is now %d\n",
4166                             __func__, ip_checkinterface);
4167                 }
4168                 break;
4169         default:
4170                 error = EINVAL;
4171                 break;
4172         }
4173         return error;
4174 }