/*
 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Copyright (c) 1982, 1986, 1988, 1993
 * The Regents of the University of California. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *    This product includes software developed by the University of
 *    California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)ip_input.c 8.2 (Berkeley) 1/4/94
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2007 to introduce
 * support for mandatory and extensible security protections. This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */

#define _IP_VHL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
#include <sys/mcache.h>
#include <sys/socketvar.h>
#include <sys/kdebug.h>
#include <mach/mach_time.h>
#include <mach/sdt.h>

#include <machine/endian.h>
#include <dev/random/randomdev.h>

#include <kern/queue.h>
#include <kern/locks.h>
#include <libkern/OSAtomic.h>

#include <pexpert/pexpert.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/kpi_protocol.h>
#include <net/ntstat.h>
#include <net/dlil.h>
#include <net/classq/classq.h>
#include <net/net_perf.h>
#include <net/init.h>
#if PF
#include <net/pfvar.h>
#endif /* PF */

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/in_arp.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/ip_icmp.h>
#include <netinet/kpi_ipfilter_var.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <netinet/bootp.h>

#if DUMMYNET
#include <netinet/ip_dummynet.h>
#endif /* DUMMYNET */

#if IPSEC
#include <netinet6/ipsec.h>
#include <netkey/key.h>
#endif /* IPSEC */

#include <os/log.h>

#define DBG_LAYER_BEG NETDBG_CODE(DBG_NETIP, 0)
#define DBG_LAYER_END NETDBG_CODE(DBG_NETIP, 2)
#define DBG_FNC_IP_INPUT NETDBG_CODE(DBG_NETIP, (2 << 8))

#if IPSEC
extern int ipsec_bypass;
extern lck_mtx_t *sadb_mutex;

lck_grp_t *sadb_stat_mutex_grp;
lck_grp_attr_t *sadb_stat_mutex_grp_attr;
lck_attr_t *sadb_stat_mutex_attr;
decl_lck_mtx_data(, sadb_stat_mutex_data);
lck_mtx_t *sadb_stat_mutex = &sadb_stat_mutex_data;
#endif /* IPSEC */

MBUFQ_HEAD(fq_head);

static int frag_timeout_run; /* frag timer is scheduled to run */
static void frag_timeout(void *);
static void frag_sched_timeout(void);

static struct ipq *ipq_alloc(int);
static void ipq_free(struct ipq *);
static void ipq_updateparams(void);
static void ip_input_second_pass(struct mbuf *, struct ifnet *,
    int, int, struct ip_fw_in_args *);

decl_lck_mtx_data(static, ipqlock);
static lck_attr_t *ipqlock_attr;
static lck_grp_t *ipqlock_grp;
static lck_grp_attr_t *ipqlock_grp_attr;

/* Packet reassembly stuff */
#define IPREASS_NHASH_LOG2 6
#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2)
#define IPREASS_HMASK (IPREASS_NHASH - 1)
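/*
 * IPREASS_HASH() folds the low nibble and the second byte's low nibble of
 * its first argument, XORs the result with its second argument, and masks
 * the value into one of the IPREASS_NHASH reassembly buckets below.
 */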
#define IPREASS_HASH(x, y) \
    (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK)

/* IP fragment reassembly queues (protected by ipqlock) */
static TAILQ_HEAD(ipqhead, ipq) ipq[IPREASS_NHASH]; /* ip reassembly queues */
static int maxnipq; /* max packets in reass queues */
static u_int32_t maxfragsperpacket; /* max frags/packet in reass queues */
static u_int32_t nipq; /* # of packets in reass queues */
static u_int32_t ipq_limit; /* ipq allocation limit */
static u_int32_t ipq_count; /* current # of allocated ipq's */

static int sysctl_ipforwarding SYSCTL_HANDLER_ARGS;
static int sysctl_maxnipq SYSCTL_HANDLER_ARGS;
static int sysctl_maxfragsperpacket SYSCTL_HANDLER_ARGS;

#if (DEBUG || DEVELOPMENT)
static int sysctl_reset_ip_input_stats SYSCTL_HANDLER_ARGS;
static int sysctl_ip_input_measure_bins SYSCTL_HANDLER_ARGS;
static int sysctl_ip_input_getperf SYSCTL_HANDLER_ARGS;
#endif /* (DEBUG || DEVELOPMENT) */

int ipforwarding = 0;
SYSCTL_PROC(_net_inet_ip, IPCTL_FORWARDING, forwarding,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &ipforwarding, 0,
    sysctl_ipforwarding, "I", "Enable IP forwarding between interfaces");

static int ipsendredirects = 1; /* XXX */
SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect,
    CTLFLAG_RW | CTLFLAG_LOCKED, &ipsendredirects, 0,
    "Enable sending IP redirects");

int ip_defttl = IPDEFTTL;
SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW | CTLFLAG_LOCKED,
    &ip_defttl, 0, "Maximum TTL on IP packets");

static int ip_dosourceroute = 0;
SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute,
    CTLFLAG_RW | CTLFLAG_LOCKED, &ip_dosourceroute, 0,
    "Enable forwarding source routed IP packets");

static int ip_acceptsourceroute = 0;
SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute,
    CTLFLAG_RW | CTLFLAG_LOCKED, &ip_acceptsourceroute, 0,
    "Enable accepting source routed IP packets");

static int ip_sendsourcequench = 0;
SYSCTL_INT(_net_inet_ip, OID_AUTO, sendsourcequench,
    CTLFLAG_RW | CTLFLAG_LOCKED, &ip_sendsourcequench, 0,
    "Enable the transmission of source quench packets");

SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &maxnipq, 0, sysctl_maxnipq,
    "I", "Maximum number of IPv4 fragment reassembly queue entries");

SYSCTL_UINT(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_RD | CTLFLAG_LOCKED,
    &nipq, 0, "Current number of IPv4 fragment reassembly queue entries");

SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragsperpacket,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &maxfragsperpacket, 0,
    sysctl_maxfragsperpacket, "I",
    "Maximum number of IPv4 fragments allowed per packet");

static uint32_t ip_adj_clear_hwcksum = 0;
SYSCTL_UINT(_net_inet_ip, OID_AUTO, adj_clear_hwcksum,
    CTLFLAG_RW | CTLFLAG_LOCKED, &ip_adj_clear_hwcksum, 0,
    "Invalidate hwcksum info when adjusting length");

static uint32_t ip_adj_partial_sum = 1;
SYSCTL_UINT(_net_inet_ip, OID_AUTO, adj_partial_sum,
    CTLFLAG_RW | CTLFLAG_LOCKED, &ip_adj_partial_sum, 0,
    "Perform partial sum adjustment of trailing bytes at IP layer");

/*
 * ip_checkinterface controls the receive side of the models for multihoming
 * that are discussed in RFC 1122.
 *
 * ip_checkinterface values are:
 * IP_CHECKINTERFACE_WEAK_ES:
 * This corresponds to the Weak End-System model where incoming packets from
 * any interface are accepted provided the destination address of the incoming packet
 * is assigned to some interface.
 *
 * IP_CHECKINTERFACE_HYBRID_ES:
 * The Hybrid End-System model uses the Strong End-System model for tunnel interfaces
 * (ipsec and utun) and the Weak End-System model for other interface families.
 * This prevents a rogue middle box from probing for signs of TCP connections
 * that use the tunnel interface.
 *
 * IP_CHECKINTERFACE_STRONG_ES:
 * The Strong End-System model requires that the packet arrive on an interface
 * that is assigned the destination address of the packet.
 *
 * Since the routing table and transmit implementation do not implement the Strong ES model,
 * setting this to a value different from IP_CHECKINTERFACE_WEAK_ES may lead to unexpected results.
 *
 * When forwarding is enabled, the system reverts to the Weak ES model as a router
 * is expected by design to receive packets from several interfaces to the same address.
 *
 * XXX - ip_checkinterface currently must be set to IP_CHECKINTERFACE_WEAK_ES if you use ipnat
 * to translate the destination address to another local interface.
 *
 * XXX - ip_checkinterface must be set to IP_CHECKINTERFACE_WEAK_ES if you add IP aliases
 * to the loopback interface instead of the interface where the
 * packets for those addresses are received.
 */
#define IP_CHECKINTERFACE_WEAK_ES 0
#define IP_CHECKINTERFACE_HYBRID_ES 1
#define IP_CHECKINTERFACE_STRONG_ES 2

static int ip_checkinterface = IP_CHECKINTERFACE_HYBRID_ES;

static int sysctl_ip_checkinterface SYSCTL_HANDLER_ARGS;
SYSCTL_PROC(_net_inet_ip, OID_AUTO, check_interface,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, sysctl_ip_checkinterface, "I", "Verify packet arrives on correct interface");

#if (DEBUG || DEVELOPMENT)
#define IP_CHECK_IF_DEBUG 1
#else
#define IP_CHECK_IF_DEBUG 0
#endif /* (DEBUG || DEVELOPMENT) */
static int ip_checkinterface_debug = IP_CHECK_IF_DEBUG;
SYSCTL_INT(_net_inet_ip, OID_AUTO, checkinterface_debug, CTLFLAG_RW | CTLFLAG_LOCKED,
    &ip_checkinterface_debug, IP_CHECK_IF_DEBUG, "");

static int ip_chaining = 1;
SYSCTL_INT(_net_inet_ip, OID_AUTO, rx_chaining, CTLFLAG_RW | CTLFLAG_LOCKED,
    &ip_chaining, 1, "Do receive side ip address based chaining");

static int ip_chainsz = 6;
SYSCTL_INT(_net_inet_ip, OID_AUTO, rx_chainsz, CTLFLAG_RW | CTLFLAG_LOCKED,
    &ip_chainsz, 1, "IP receive side max chaining");

#if (DEBUG || DEVELOPMENT)
static int ip_input_measure = 0;
SYSCTL_PROC(_net_inet_ip, OID_AUTO, input_perf,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &ip_input_measure, 0, sysctl_reset_ip_input_stats, "I", "Do time measurement");

static uint64_t ip_input_measure_bins = 0;
SYSCTL_PROC(_net_inet_ip, OID_AUTO, input_perf_bins,
    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &ip_input_measure_bins, 0,
    sysctl_ip_input_measure_bins, "I",
    "bins for chaining performance data histogram");

static net_perf_t net_perf;
SYSCTL_PROC(_net_inet_ip, OID_AUTO, input_perf_data,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, sysctl_ip_input_getperf, "S,net_perf",
    "IP input performance data (struct net_perf, net/net_perf.h)");
#endif /* (DEBUG || DEVELOPMENT) */

#if DIAGNOSTIC
static int ipprintfs = 0;
#endif

struct protosw *ip_protox[IPPROTO_MAX];

static lck_grp_attr_t *in_ifaddr_rwlock_grp_attr;
static lck_grp_t *in_ifaddr_rwlock_grp;
static lck_attr_t *in_ifaddr_rwlock_attr;
decl_lck_rw_data(, in_ifaddr_rwlock_data);
lck_rw_t *in_ifaddr_rwlock = &in_ifaddr_rwlock_data;

/* Protected by in_ifaddr_rwlock */
struct in_ifaddrhead in_ifaddrhead; /* first inet address */
struct in_ifaddrhashhead *in_ifaddrhashtbl; /* inet addr hash table */

#define INADDR_NHASH 61
static u_int32_t inaddr_nhash; /* hash table size */
static u_int32_t inaddr_hashp; /* next largest prime */

static int ip_getstat SYSCTL_HANDLER_ARGS;
struct ipstat ipstat;
SYSCTL_PROC(_net_inet_ip, IPCTL_STATS, stats,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, ip_getstat, "S,ipstat",
    "IP statistics (struct ipstat, netinet/ip_var.h)");

#if IPCTL_DEFMTU
SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW | CTLFLAG_LOCKED,
    &ip_mtu, 0, "Default MTU");
#endif /* IPCTL_DEFMTU */

#if IPSTEALTH
static int ipstealth = 0;
SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW | CTLFLAG_LOCKED,
    &ipstealth, 0, "");
#endif /* IPSTEALTH */

#if DUMMYNET
ip_dn_io_t *ip_dn_io_ptr;
#endif /* DUMMYNET */

SYSCTL_NODE(_net_inet_ip, OID_AUTO, linklocal,
    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "link local");

struct ip_linklocal_stat ip_linklocal_stat;
SYSCTL_STRUCT(_net_inet_ip_linklocal, OID_AUTO, stat,
    CTLFLAG_RD | CTLFLAG_LOCKED, &ip_linklocal_stat, ip_linklocal_stat,
    "Number of link local packets with TTL less than 255");

SYSCTL_NODE(_net_inet_ip_linklocal, OID_AUTO, in,
    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "link local input");

int ip_linklocal_in_allowbadttl = 1;
SYSCTL_INT(_net_inet_ip_linklocal_in, OID_AUTO, allowbadttl,
    CTLFLAG_RW | CTLFLAG_LOCKED, &ip_linklocal_in_allowbadttl, 0,
    "Allow incoming link local packets with TTL less than 255");

/*
 * We need to save the IP options in case a protocol wants to respond
 * to an incoming packet over the same route if the packet got here
 * using IP source routing. This allows connection establishment and
 * maintenance when the remote end is on a network that is not known
 * to us.
 */
static int ip_nhops = 0;
static struct ip_srcrt {
    struct in_addr dst; /* final destination */
    char nop; /* one NOP to align */
    char srcopt[IPOPT_OFFSET + 1]; /* OPTVAL, OLEN and OFFSET */
    struct in_addr route[MAX_IPOPTLEN / sizeof(struct in_addr)];
} ip_srcrt;

static void in_ifaddrhashtbl_init(void);
static void save_rte(u_char *, struct in_addr);
static int ip_dooptions(struct mbuf *, int, struct sockaddr_in *);
static void ip_forward(struct mbuf *, int, struct sockaddr_in *);
static void frag_freef(struct ipqhead *, struct ipq *);
static struct mbuf *ip_reass(struct mbuf *);
static void ip_fwd_route_copyout(struct ifnet *, struct route *);
static void ip_fwd_route_copyin(struct ifnet *, struct route *);
static inline u_short ip_cksum(struct mbuf *, int);

int ip_use_randomid = 1;
SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW | CTLFLAG_LOCKED,
    &ip_use_randomid, 0, "Randomize IP packets IDs");

/*
 * On platforms which require strict alignment (currently for anything but
 * i386 or x86_64), check if the IP header pointer is 32-bit aligned; if not,
 * copy the contents of the mbuf chain into a new chain, and free the original
 * one. Create some head room in the first mbuf of the new chain, in case
 * it's needed later on.
 */
#if defined(__i386__) || defined(__x86_64__)
#define IP_HDR_ALIGNMENT_FIXUP(_m, _ifp, _action) do { } while (0)
#else /* !__i386__ && !__x86_64__ */
#define IP_HDR_ALIGNMENT_FIXUP(_m, _ifp, _action) do { \
    if (!IP_HDR_ALIGNED_P(mtod(_m, caddr_t))) { \
        struct mbuf *_n; \
        struct ifnet *__ifp = (_ifp); \
        atomic_add_64(&(__ifp)->if_alignerrs, 1); \
        if (((_m)->m_flags & M_PKTHDR) && \
            (_m)->m_pkthdr.pkt_hdr != NULL) \
                (_m)->m_pkthdr.pkt_hdr = NULL; \
        _n = m_defrag_offset(_m, max_linkhdr, M_NOWAIT); \
        if (_n == NULL) { \
            atomic_add_32(&ipstat.ips_toosmall, 1); \
            m_freem(_m); \
            (_m) = NULL; \
            _action; \
        } else { \
            VERIFY(_n != (_m)); \
            (_m) = _n; \
        } \
    } \
} while (0)
#endif /* !__i386__ && !__x86_64__ */

typedef enum ip_check_if_result {
    IP_CHECK_IF_NONE = 0,
    IP_CHECK_IF_OURS = 1,
    IP_CHECK_IF_DROP = 2,
    IP_CHECK_IF_FORWARD = 3
} ip_check_if_result_t;

static ip_check_if_result_t ip_input_check_interface(struct mbuf **, struct ip *, struct ifnet *);

/*
 * GRE input handler function, settable via ip_gre_register_input() for PPTP.
 */
static gre_input_func_t gre_input_func;

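/*
 * Deferred loopback setup: assigns 127.0.0.1 to lo0 via SIOCSIFADDR once
 * the networking stack is ready (registered below with net_init_add()).
 */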
static void
ip_init_delayed(void)
{
    struct ifreq ifr;
    int error;
    struct sockaddr_in *sin;

    bzero(&ifr, sizeof(ifr));
    strlcpy(ifr.ifr_name, "lo0", sizeof(ifr.ifr_name));
    sin = (struct sockaddr_in *)(void *)&ifr.ifr_addr;
    sin->sin_len = sizeof(struct sockaddr_in);
    sin->sin_family = AF_INET;
    sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
    error = in_control(NULL, SIOCSIFADDR, (caddr_t)&ifr, lo_ifp, kernproc);
    if (error) {
        printf("%s: failed to initialise lo0's address, error=%d\n",
            __func__, error);
    }
}

/*
 * IP initialization: fill in IP protocol switch table.
 * All protocols not implemented in kernel go to raw IP protocol handler.
 */
void
ip_init(struct protosw *pp, struct domain *dp)
{
    static int ip_initialized = 0;
    struct protosw *pr;
    struct timeval tv;
    int i;

    domain_proto_mtx_lock_assert_held();
    VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED);

    /* ipq_alloc() uses mbufs for IP fragment queue structures */
    _CASSERT(sizeof(struct ipq) <= _MLEN);

    /*
     * Some ioctls (e.g. SIOCAIFADDR) use ifaliasreq struct, which is
     * interchangeable with in_aliasreq; they must have the same size.
     */
    _CASSERT(sizeof(struct ifaliasreq) == sizeof(struct in_aliasreq));

    if (ip_initialized) {
        return;
    }
    ip_initialized = 1;

    in_ifaddr_init();

    in_ifaddr_rwlock_grp_attr = lck_grp_attr_alloc_init();
    in_ifaddr_rwlock_grp = lck_grp_alloc_init("in_ifaddr_rwlock",
        in_ifaddr_rwlock_grp_attr);
    in_ifaddr_rwlock_attr = lck_attr_alloc_init();
    lck_rw_init(in_ifaddr_rwlock, in_ifaddr_rwlock_grp,
        in_ifaddr_rwlock_attr);

    TAILQ_INIT(&in_ifaddrhead);
    in_ifaddrhashtbl_init();

    ip_moptions_init();

    pr = pffindproto_locked(PF_INET, IPPROTO_RAW, SOCK_RAW);
    if (pr == NULL) {
        panic("%s: Unable to find [PF_INET,IPPROTO_RAW,SOCK_RAW]\n",
            __func__);
        /* NOTREACHED */
    }

    /* Initialize the entire ip_protox[] array to IPPROTO_RAW. */
    for (i = 0; i < IPPROTO_MAX; i++) {
        ip_protox[i] = pr;
    }
    /*
     * Cycle through IP protocols and put them into the appropriate place
     * in ip_protox[], skipping protocols IPPROTO_{IP,RAW}.
     */
    VERIFY(dp == inetdomain && dp->dom_family == PF_INET);
    TAILQ_FOREACH(pr, &dp->dom_protosw, pr_entry) {
        VERIFY(pr->pr_domain == dp);
        if (pr->pr_protocol != 0 && pr->pr_protocol != IPPROTO_RAW) {
            /* Be careful to only index valid IP protocols. */
            if (pr->pr_protocol < IPPROTO_MAX) {
                ip_protox[pr->pr_protocol] = pr;
            }
        }
    }

    /* IP fragment reassembly queue lock */
    ipqlock_grp_attr = lck_grp_attr_alloc_init();
    ipqlock_grp = lck_grp_alloc_init("ipqlock", ipqlock_grp_attr);
    ipqlock_attr = lck_attr_alloc_init();
    lck_mtx_init(&ipqlock, ipqlock_grp, ipqlock_attr);

    lck_mtx_lock(&ipqlock);
    /* Initialize IP reassembly queue. */
    for (i = 0; i < IPREASS_NHASH; i++) {
        TAILQ_INIT(&ipq[i]);
    }

    maxnipq = nmbclusters / 32;
    maxfragsperpacket = 128; /* enough for 64k in 512 byte fragments */
    ipq_updateparams();
    lck_mtx_unlock(&ipqlock);

    getmicrotime(&tv);
    ip_id = RandomULong() ^ tv.tv_usec;
    ip_initid();

    ipf_init();

    PE_parse_boot_argn("ip_checkinterface", &i, sizeof(i));
    switch (i) {
    case IP_CHECKINTERFACE_WEAK_ES:
    case IP_CHECKINTERFACE_HYBRID_ES:
    case IP_CHECKINTERFACE_STRONG_ES:
        ip_checkinterface = i;
        break;
    default:
        break;
    }

#if IPSEC
    sadb_stat_mutex_grp_attr = lck_grp_attr_alloc_init();
    sadb_stat_mutex_grp = lck_grp_alloc_init("sadb_stat",
        sadb_stat_mutex_grp_attr);
    sadb_stat_mutex_attr = lck_attr_alloc_init();
    lck_mtx_init(sadb_stat_mutex, sadb_stat_mutex_grp,
        sadb_stat_mutex_attr);

#endif
    arp_init();
    net_init_add(ip_init_delayed);
}

/*
 * Initialize IPv4 source address hash table.
 */
static void
in_ifaddrhashtbl_init(void)
{
    int i, k, p;

    if (in_ifaddrhashtbl != NULL) {
        return;
    }

    PE_parse_boot_argn("inaddr_nhash", &inaddr_nhash,
        sizeof(inaddr_nhash));
    if (inaddr_nhash == 0) {
        inaddr_nhash = INADDR_NHASH;
    }

    MALLOC(in_ifaddrhashtbl, struct in_ifaddrhashhead *,
        inaddr_nhash * sizeof(*in_ifaddrhashtbl),
        M_IFADDR, M_WAITOK | M_ZERO);
    if (in_ifaddrhashtbl == NULL) {
        panic("in_ifaddrhashtbl_init allocation failed");
    }

    /*
     * Generate the next largest prime greater than inaddr_nhash.
     */
    k = (inaddr_nhash % 2 == 0) ? inaddr_nhash + 1 : inaddr_nhash + 2;
    for (;;) {
        p = 1;
        for (i = 3; i * i <= k; i += 2) {
            if (k % i == 0) {
                p = 0;
            }
        }
        if (p == 1) {
            break;
        }
        k += 2;
    }
    inaddr_hashp = k;
}

u_int32_t
inaddr_hashval(u_int32_t key)
{
    /*
     * The hash index is the computed prime times the key modulo
     * the hash size, as documented in "Introduction to Algorithms"
     * (Cormen, Leiserson, Rivest).
     */
    if (inaddr_nhash > 1) {
        return (key * inaddr_hashp) % inaddr_nhash;
    } else {
        return 0;
    }
}

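/*
 * Run the packet through any registered IPv4 filters (skipping filters up
 * to and including the one that re-injected it, if any), then hand it to
 * the protocol's pr_input routine, taking the shared inet_domain_mutex
 * unless the protocol declares PR_PROTOLOCK and does its own locking.
 */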
__private_extern__ void
ip_proto_dispatch_in(struct mbuf *m, int hlen, u_int8_t proto,
    ipfilter_t inject_ipfref)
{
    struct ipfilter *filter;
    int seen = (inject_ipfref == NULL);
    int changed_header = 0;
    struct ip *ip;
    void (*pr_input)(struct mbuf *, int len);

    if (!TAILQ_EMPTY(&ipv4_filters)) {
        ipf_ref();
        TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
            if (seen == 0) {
                if ((struct ipfilter *)inject_ipfref == filter) {
                    seen = 1;
                }
            } else if (filter->ipf_filter.ipf_input) {
                errno_t result;

                if (changed_header == 0) {
                    /*
                     * Perform IP header alignment fixup,
                     * if needed, before passing packet
                     * into filter(s).
                     */
                    IP_HDR_ALIGNMENT_FIXUP(m,
                        m->m_pkthdr.rcvif, ipf_unref());

                    /* ipf_unref() already called */
                    if (m == NULL) {
                        return;
                    }

                    changed_header = 1;
                    ip = mtod(m, struct ip *);
                    ip->ip_len = htons(ip->ip_len + hlen);
                    ip->ip_off = htons(ip->ip_off);
                    ip->ip_sum = 0;
                    ip->ip_sum = ip_cksum_hdr_in(m, hlen);
                }
                result = filter->ipf_filter.ipf_input(
                    filter->ipf_filter.cookie, (mbuf_t *)&m,
                    hlen, proto);
                if (result == EJUSTRETURN) {
                    ipf_unref();
                    return;
                }
                if (result != 0) {
                    ipf_unref();
                    m_freem(m);
                    return;
                }
            }
        }
        ipf_unref();
    }

    /* Perform IP header alignment fixup (post-filters), if needed */
    IP_HDR_ALIGNMENT_FIXUP(m, m->m_pkthdr.rcvif, return );

    ip = mtod(m, struct ip *);

    if (changed_header) {
        ip->ip_len = ntohs(ip->ip_len) - hlen;
        ip->ip_off = ntohs(ip->ip_off);
    }

    /*
     * If there isn't a specific lock for the protocol
     * we're about to call, use the generic lock for AF_INET.
     * Otherwise, let the protocol deal with its own locking.
     */
    if ((pr_input = ip_protox[ip->ip_p]->pr_input) == NULL) {
        m_freem(m);
    } else if (!(ip_protox[ip->ip_p]->pr_flags & PR_PROTOLOCK)) {
        lck_mtx_lock(inet_domain_mutex);
        pr_input(m, hlen);
        lck_mtx_unlock(inet_domain_mutex);
    } else {
        pr_input(m, hlen);
    }
}

struct pktchain_elm {
    struct mbuf *pkte_head;
    struct mbuf *pkte_tail;
    struct in_addr pkte_saddr;
    struct in_addr pkte_daddr;
    uint16_t pkte_npkts;
    uint16_t pkte_proto;
    uint32_t pkte_nbytes;
};

typedef struct pktchain_elm pktchain_elm_t;

/* Store up to PKTTBL_SZ unique flows on the stack */
#define PKTTBL_SZ 7

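/*
 * Append the packet to the table slot hashed from its source address when
 * it belongs to the (src, dst, proto) flow already stored there; return
 * NULL once chained, or the packet itself on a slot collision so the
 * caller can flush the table and restart chaining.
 */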
static struct mbuf *
ip_chain_insert(struct mbuf *packet, pktchain_elm_t *tbl)
{
    struct ip *ip;
    int pkttbl_idx = 0;

    ip = mtod(packet, struct ip *);

    /* reusing the hash function from inaddr_hashval */
    pkttbl_idx = inaddr_hashval(ntohs(ip->ip_src.s_addr)) % PKTTBL_SZ;
    if (tbl[pkttbl_idx].pkte_head == NULL) {
        tbl[pkttbl_idx].pkte_head = packet;
        tbl[pkttbl_idx].pkte_saddr.s_addr = ip->ip_src.s_addr;
        tbl[pkttbl_idx].pkte_daddr.s_addr = ip->ip_dst.s_addr;
        tbl[pkttbl_idx].pkte_proto = ip->ip_p;
    } else {
        if ((ip->ip_dst.s_addr == tbl[pkttbl_idx].pkte_daddr.s_addr) &&
            (ip->ip_src.s_addr == tbl[pkttbl_idx].pkte_saddr.s_addr) &&
            (ip->ip_p == tbl[pkttbl_idx].pkte_proto)) {
        } else {
            return packet;
        }
    }
    if (tbl[pkttbl_idx].pkte_tail != NULL) {
        mbuf_setnextpkt(tbl[pkttbl_idx].pkte_tail, packet);
    }

    tbl[pkttbl_idx].pkte_tail = packet;
    tbl[pkttbl_idx].pkte_npkts += 1;
    tbl[pkttbl_idx].pkte_nbytes += packet->m_pkthdr.len;
    return NULL;
}

/* args is a dummy variable here for backward compatibility */
static void
ip_input_second_pass_loop_tbl(pktchain_elm_t *tbl, struct ip_fw_in_args *args)
{
    int i = 0;

    for (i = 0; i < PKTTBL_SZ; i++) {
        if (tbl[i].pkte_head != NULL) {
            struct mbuf *m = tbl[i].pkte_head;
            ip_input_second_pass(m, m->m_pkthdr.rcvif,
                tbl[i].pkte_npkts, tbl[i].pkte_nbytes, args);

            if (tbl[i].pkte_npkts > 2) {
                ipstat.ips_rxc_chainsz_gt2++;
            }
            if (tbl[i].pkte_npkts > 4) {
                ipstat.ips_rxc_chainsz_gt4++;
            }
#if (DEBUG || DEVELOPMENT)
            if (ip_input_measure) {
                net_perf_histogram(&net_perf, tbl[i].pkte_npkts);
            }
#endif /* (DEBUG || DEVELOPMENT) */
            tbl[i].pkte_head = tbl[i].pkte_tail = NULL;
            tbl[i].pkte_npkts = 0;
            tbl[i].pkte_nbytes = 0;
            /* no need to initialize address and protocol in tbl */
        }
    }
}

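/*
 * Helpers to copy the pf rule state between the compact ip_fw_in_args used
 * while chaining and the full ip_fw_args structure consumed by pf_af_hook().
 */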
static void
ip_input_cpout_args(struct ip_fw_in_args *args, struct ip_fw_args *args1,
    boolean_t *done_init)
{
    if (*done_init == FALSE) {
        bzero(args1, sizeof(struct ip_fw_args));
        *done_init = TRUE;
    }
    args1->fwa_pf_rule = args->fwai_pf_rule;
}

static void
ip_input_cpin_args(struct ip_fw_args *args1, struct ip_fw_in_args *args)
{
    args->fwai_pf_rule = args1->fwa_pf_rule;
}

typedef enum {
    IPINPUT_DOCHAIN = 0,
    IPINPUT_DONTCHAIN,
    IPINPUT_FREED,
    IPINPUT_DONE
} ipinput_chain_ret_t;

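/*
 * Attribute received packet and byte counts to the cached route back to
 * the source address, for network statistics, when nstat is enabled.
 */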
static void
ip_input_update_nstat(struct ifnet *ifp, struct in_addr src_ip,
    u_int32_t packets, u_int32_t bytes)
{
    if (nstat_collect) {
        struct rtentry *rt = ifnet_cached_rtlookup_inet(ifp,
            src_ip);
        if (rt != NULL) {
            nstat_route_rx(rt, packets, bytes, 0);
            rtfree(rt);
        }
    }
}

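/*
 * Unlink each packet of the chain and hand it to the protocol input
 * routine; packets after the first still carry the IP header length in
 * ip_len, so it is subtracted here before dispatch.
 */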
static void
ip_input_dispatch_chain(struct mbuf *m)
{
    struct mbuf *tmp_mbuf = m;
    struct mbuf *nxt_mbuf = NULL;
    struct ip *ip = NULL;
    unsigned int hlen;

    ip = mtod(tmp_mbuf, struct ip *);
    hlen = IP_VHL_HL(ip->ip_vhl) << 2;
    while (tmp_mbuf != NULL) {
        nxt_mbuf = mbuf_nextpkt(tmp_mbuf);
        mbuf_setnextpkt(tmp_mbuf, NULL);
        ip_proto_dispatch_in(tmp_mbuf, hlen, ip->ip_p, 0);
        tmp_mbuf = nxt_mbuf;
        if (tmp_mbuf) {
            ip = mtod(tmp_mbuf, struct ip *);
            /* first mbuf of chain already has adjusted ip_len */
            hlen = IP_VHL_HL(ip->ip_vhl) << 2;
            ip->ip_len -= hlen;
        }
    }
}

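/* Record the destination ifaddr/ifindex on every packet of the chain. */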
static void
ip_input_setdst_chain(struct mbuf *m, uint32_t ifindex, struct in_ifaddr *ia)
{
    struct mbuf *tmp_mbuf = m;

    while (tmp_mbuf != NULL) {
        ip_setdstifaddr_info(tmp_mbuf, ifindex, ia);
        tmp_mbuf = mbuf_nextpkt(tmp_mbuf);
    }
}

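/*
 * Trim link-layer padding beyond ip_len and, depending on the sysctls
 * above, clear or adjust any partial hardware checksum covering the
 * extraneous trailing bytes.
 */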
static void
ip_input_adjust(struct mbuf *m, struct ip *ip, struct ifnet *inifp)
{
    boolean_t adjust = TRUE;

    ASSERT(m_pktlen(m) > ip->ip_len);

    /*
     * Invalidate hardware checksum info if ip_adj_clear_hwcksum
     * is set; useful to handle buggy drivers. Note that this
     * should not be enabled by default, as we may get here due
     * to link-layer padding.
     */
    if (ip_adj_clear_hwcksum &&
        (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) &&
        !(inifp->if_flags & IFF_LOOPBACK) &&
        !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
        m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
        m->m_pkthdr.csum_data = 0;
        ipstat.ips_adj_hwcsum_clr++;
    }

    /*
     * If partial checksum information is available, subtract
     * out the partial sum of postpended extraneous bytes, and
     * update the checksum metadata accordingly. By doing it
     * here, the upper layer transport only needs to adjust any
     * prepended extraneous bytes (else it will do both.)
     */
    if (ip_adj_partial_sum &&
        (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
        (CSUM_DATA_VALID | CSUM_PARTIAL)) {
        m->m_pkthdr.csum_rx_val = m_adj_sum16(m,
            m->m_pkthdr.csum_rx_start, m->m_pkthdr.csum_rx_start,
            (ip->ip_len - m->m_pkthdr.csum_rx_start),
            m->m_pkthdr.csum_rx_val);
    } else if ((m->m_pkthdr.csum_flags &
        (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
        (CSUM_DATA_VALID | CSUM_PARTIAL)) {
        /*
         * If packet has partial checksum info and we decided not
         * to subtract the partial sum of postpended extraneous
         * bytes here (not the default case), leave that work to
         * be handled by the other layers. For now, only TCP, UDP
         * layers are capable of dealing with this. For all other
         * protocols (including fragments), trim and ditch the
         * partial sum as those layers might not implement partial
         * checksumming (or adjustment) at all.
         */
        if ((ip->ip_off & (IP_MF | IP_OFFMASK)) == 0 &&
            (ip->ip_p == IPPROTO_TCP || ip->ip_p == IPPROTO_UDP)) {
            adjust = FALSE;
        } else {
            m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
            m->m_pkthdr.csum_data = 0;
            ipstat.ips_adj_hwcsum_clr++;
        }
    }

    if (adjust) {
        ipstat.ips_adj++;
        if (m->m_len == m->m_pkthdr.len) {
            m->m_len = ip->ip_len;
            m->m_pkthdr.len = ip->ip_len;
        } else {
            m_adj(m, ip->ip_len - m->m_pkthdr.len);
        }
    }
}

/*
 * The first pass does all essential packet validation and places the packet
 * on a per-flow queue for operations that have the same outcome for all
 * packets of a flow.
 */
static ipinput_chain_ret_t
ip_input_first_pass(struct mbuf *m, struct ip_fw_in_args *args, struct mbuf **modm)
{
    struct ip *ip;
    struct ifnet *inifp;
    unsigned int hlen;
    int retval = IPINPUT_DOCHAIN;
    int len = 0;
    struct in_addr src_ip;
#if DUMMYNET
    struct m_tag *copy;
    struct m_tag *p;
    boolean_t delete = FALSE;
    struct ip_fw_args args1;
    boolean_t init = FALSE;
#endif /* DUMMYNET */
    ipfilter_t inject_filter_ref = NULL;

    /* Check if the mbuf is still valid after interface filter processing */
    MBUF_INPUT_CHECK(m, m->m_pkthdr.rcvif);
    inifp = mbuf_pkthdr_rcvif(m);
    VERIFY(inifp != NULL);

    /* Perform IP header alignment fixup, if needed */
    IP_HDR_ALIGNMENT_FIXUP(m, inifp, goto bad);

    m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;

#if DUMMYNET
    /*
     * Don't bother searching for tag(s) if there's none.
     */
    if (SLIST_EMPTY(&m->m_pkthdr.tags)) {
        goto ipfw_tags_done;
    }

    /* Grab info from mtags prepended to the chain */
    p = m_tag_first(m);
    while (p) {
        if (p->m_tag_id == KERNEL_MODULE_TAG_ID) {
            if (p->m_tag_type == KERNEL_TAG_TYPE_DUMMYNET) {
                struct dn_pkt_tag *dn_tag;

                dn_tag = (struct dn_pkt_tag *)(p + 1);
                args->fwai_pf_rule = dn_tag->dn_pf_rule;
                delete = TRUE;
            }

            if (delete) {
                copy = p;
                p = m_tag_next(m, p);
                m_tag_delete(m, copy);
            } else {
                p = m_tag_next(m, p);
            }
        } else {
            p = m_tag_next(m, p);
        }
    }

#if DIAGNOSTIC
    if (m == NULL || !(m->m_flags & M_PKTHDR)) {
        panic("ip_input no HDR");
    }
#endif

    if (args->fwai_pf_rule) {
        /* dummynet already filtered us */
        ip = mtod(m, struct ip *);
        hlen = IP_VHL_HL(ip->ip_vhl) << 2;
        inject_filter_ref = ipf_get_inject_filter(m);
        if (args->fwai_pf_rule) {
            goto check_with_pf;
        }
    }
ipfw_tags_done:
#endif /* DUMMYNET */

    /*
     * No need to process packet twice if we've already seen it.
     */
    if (!SLIST_EMPTY(&m->m_pkthdr.tags)) {
        inject_filter_ref = ipf_get_inject_filter(m);
    }
    if (inject_filter_ref != NULL) {
        ip = mtod(m, struct ip *);
        hlen = IP_VHL_HL(ip->ip_vhl) << 2;

        DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL,
            struct ip *, ip, struct ifnet *, inifp,
            struct ip *, ip, struct ip6_hdr *, NULL);

        ip->ip_len = ntohs(ip->ip_len) - hlen;
        ip->ip_off = ntohs(ip->ip_off);
        ip_proto_dispatch_in(m, hlen, ip->ip_p, inject_filter_ref);
        return IPINPUT_DONE;
    }

    if (m->m_pkthdr.len < sizeof(struct ip)) {
        OSAddAtomic(1, &ipstat.ips_total);
        OSAddAtomic(1, &ipstat.ips_tooshort);
        m_freem(m);
        return IPINPUT_FREED;
    }

    if (m->m_len < sizeof(struct ip) &&
        (m = m_pullup(m, sizeof(struct ip))) == NULL) {
        OSAddAtomic(1, &ipstat.ips_total);
        OSAddAtomic(1, &ipstat.ips_toosmall);
        return IPINPUT_FREED;
    }

    ip = mtod(m, struct ip *);
    *modm = m;

    KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr, ip->ip_src.s_addr,
        ip->ip_p, ip->ip_off, ip->ip_len);

    if (IP_VHL_V(ip->ip_vhl) != IPVERSION) {
        OSAddAtomic(1, &ipstat.ips_total);
        OSAddAtomic(1, &ipstat.ips_badvers);
        KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
        m_freem(m);
        return IPINPUT_FREED;
    }

    hlen = IP_VHL_HL(ip->ip_vhl) << 2;
    if (hlen < sizeof(struct ip)) {
        OSAddAtomic(1, &ipstat.ips_total);
        OSAddAtomic(1, &ipstat.ips_badhlen);
        KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
        m_freem(m);
        return IPINPUT_FREED;
    }

    if (hlen > m->m_len) {
        if ((m = m_pullup(m, hlen)) == NULL) {
            OSAddAtomic(1, &ipstat.ips_total);
            OSAddAtomic(1, &ipstat.ips_badhlen);
            KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
            return IPINPUT_FREED;
        }
        ip = mtod(m, struct ip *);
        *modm = m;
    }

    /* 127/8 must not appear on wire - RFC1122 */
    if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
        (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
        /*
         * Allow for the following exceptions:
         *
         * 1. If the packet was sent to loopback (i.e. rcvif
         *    would have been set earlier at output time.)
         *
         * 2. If the packet was sent out on loopback from a local
         *    source address which belongs to a non-loopback
         *    interface (i.e. rcvif may not necessarily be a
         *    loopback interface, hence the test for PKTF_LOOP.)
         *    Unlike IPv6, there is no interface scope ID, and
         *    therefore we don't care so much about PKTF_IFINFO.
         */
        if (!(inifp->if_flags & IFF_LOOPBACK) &&
            !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
            OSAddAtomic(1, &ipstat.ips_total);
            OSAddAtomic(1, &ipstat.ips_badaddr);
            KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
            m_freem(m);
            return IPINPUT_FREED;
        }
    }

    /* IPv4 Link-Local Addresses as defined in RFC3927 */
    if ((IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) ||
        IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)))) {
        ip_linklocal_stat.iplls_in_total++;
        if (ip->ip_ttl != MAXTTL) {
            OSAddAtomic(1, &ip_linklocal_stat.iplls_in_badttl);
            /* Silently drop link local traffic with bad TTL */
            if (!ip_linklocal_in_allowbadttl) {
                OSAddAtomic(1, &ipstat.ips_total);
                KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
                m_freem(m);
                return IPINPUT_FREED;
            }
        }
    }

    if (ip_cksum(m, hlen)) {
        OSAddAtomic(1, &ipstat.ips_total);
        KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
        m_freem(m);
        return IPINPUT_FREED;
    }

    DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL,
        struct ip *, ip, struct ifnet *, inifp,
        struct ip *, ip, struct ip6_hdr *, NULL);

    /*
     * Convert fields to host representation.
     */
#if BYTE_ORDER != BIG_ENDIAN
    NTOHS(ip->ip_len);
#endif

    if (ip->ip_len < hlen) {
        OSAddAtomic(1, &ipstat.ips_total);
        OSAddAtomic(1, &ipstat.ips_badlen);
        KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
        m_freem(m);
        return IPINPUT_FREED;
    }

#if BYTE_ORDER != BIG_ENDIAN
    NTOHS(ip->ip_off);
#endif

    /*
     * Check that the amount of data in the buffers
     * is at least as much as the IP header would have us expect.
     * Trim mbufs if longer than we expect.
     * Drop packet if shorter than we expect.
     */
1189 OSAddAtomic(1, &ipstat.ips_tooshort);
1190 KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1191 m_freem(m);
0a7de745 1192 return IPINPUT_FREED;
3e170ce0
A
1193 }
1194
1195 if (m->m_pkthdr.len > ip->ip_len) {
5ba3f43e 1196 ip_input_adjust(m, ip, inifp);
3e170ce0
A
1197 }
1198
3e170ce0
A
1199 /* for netstat route statistics */
1200 src_ip = ip->ip_src;
1201 len = m->m_pkthdr.len;
1202
1203#if DUMMYNET
1204check_with_pf:
f427ee49 1205#endif /* DUMMYNET */
3e170ce0
A
1206#if PF
1207 /* Invoke inbound packet filter */
1208 if (PF_IS_ENABLED) {
1209 int error;
1210 ip_input_cpout_args(args, &args1, &init);
5ba3f43e
A
1211 ip = mtod(m, struct ip *);
1212 src_ip = ip->ip_src;
3e170ce0
A
1213
1214#if DUMMYNET
1215 error = pf_af_hook(inifp, NULL, &m, AF_INET, TRUE, &args1);
1216#else
1217 error = pf_af_hook(inifp, NULL, &m, AF_INET, TRUE, NULL);
1218#endif /* DUMMYNET */
1219 if (error != 0 || m == NULL) {
1220 if (m != NULL) {
1221 panic("%s: unexpected packet %p\n",
1222 __func__, m);
1223 /* NOTREACHED */
1224 }
1225 /* Already freed by callee */
1226 ip_input_update_nstat(inifp, src_ip, 1, len);
1227 KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1228 OSAddAtomic(1, &ipstat.ips_total);
0a7de745 1229 return IPINPUT_FREED;
3e170ce0
A
1230 }
1231 ip = mtod(m, struct ip *);
1232 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1233 *modm = m;
1234 ip_input_cpin_args(&args1, args);
1235 }
1236#endif /* PF */
1237
1238#if IPSEC
1239 if (ipsec_bypass == 0 && ipsec_gethist(m, NULL)) {
1240 retval = IPINPUT_DONTCHAIN; /* XXX scope for chaining here? */
1241 goto pass;
1242 }
1243#endif
1244
f427ee49 1245#if IPSEC
3e170ce0
A
1246pass:
1247#endif
1248 /*
1249 * Process options and, if not destined for us,
1250 * ship it on. ip_dooptions returns 1 when an
1251 * error was detected (causing an icmp message
1252 * to be sent and the original packet to be freed).
1253 */
0a7de745 1254 ip_nhops = 0; /* for source routed packets */
0a7de745 1255 if (hlen > sizeof(struct ip) && ip_dooptions(m, 0, NULL)) {
3e170ce0
A
1256 ip_input_update_nstat(inifp, src_ip, 1, len);
1257 KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1258 OSAddAtomic(1, &ipstat.ips_total);
0a7de745 1259 return IPINPUT_FREED;
3e170ce0
A
1260 }
1261
1262 /*
f427ee49 1263 * Don't chain fragmented packets
3e170ce0 1264 */
0a7de745
A
1265 if (ip->ip_off & ~(IP_DF | IP_RF)) {
1266 return IPINPUT_DONTCHAIN;
1267 }
3e170ce0
A
1268
1269 /* Allow DHCP/BootP responses through */
1270 if ((inifp->if_eflags & IFEF_AUTOCONFIGURING) &&
0a7de745 1271 hlen == sizeof(struct ip) && ip->ip_p == IPPROTO_UDP) {
3e170ce0
A
1272 struct udpiphdr *ui;
1273
0a7de745
A
1274 if (m->m_len < sizeof(struct udpiphdr) &&
1275 (m = m_pullup(m, sizeof(struct udpiphdr))) == NULL) {
3e170ce0
A
1276 OSAddAtomic(1, &udpstat.udps_hdrops);
1277 KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1278 OSAddAtomic(1, &ipstat.ips_total);
0a7de745 1279 return IPINPUT_FREED;
3e170ce0
A
1280 }
1281 *modm = m;
1282 ui = mtod(m, struct udpiphdr *);
1283 if (ntohs(ui->ui_dport) == IPPORT_BOOTPC) {
1284 ip_setdstifaddr_info(m, inifp->if_index, NULL);
0a7de745 1285 return IPINPUT_DONTCHAIN;
3e170ce0
A
1286 }
1287 }
1288
1289 /* Avoid chaining raw sockets as ipsec checks occur later for them */
0a7de745
A
1290 if (ip_protox[ip->ip_p]->pr_flags & PR_LASTHDR) {
1291 return IPINPUT_DONTCHAIN;
1292 }
3e170ce0 1293
0a7de745 1294 return retval;
3e170ce0
A
1295#if !defined(__i386__) && !defined(__x86_64__)
1296bad:
1297 m_freem(m);
0a7de745 1298 return IPINPUT_FREED;
3e170ce0
A
1299#endif
1300}
1301
eb6b6ca3
A
1302/*
1303 * Because the call to m_pullup() may freem the mbuf, the function frees the mbuf packet
1304 * chain before it return IP_CHECK_IF_DROP
1305 */
1306static ip_check_if_result_t
1307ip_input_check_interface(struct mbuf **mp, struct ip *ip, struct ifnet *inifp)
1308{
1309 struct mbuf *m = *mp;
1310 struct in_ifaddr *ia = NULL;
1311 struct in_ifaddr *best_ia = NULL;
1312 struct ifnet *match_ifp = NULL;
1313 ip_check_if_result_t result = IP_CHECK_IF_NONE;
1314
1315 /*
1316 * Host broadcast and all network broadcast addresses are always a match
1317 */
1318 if (ip->ip_dst.s_addr == (u_int32_t)INADDR_BROADCAST ||
1319 ip->ip_dst.s_addr == INADDR_ANY) {
1320 ip_input_setdst_chain(m, inifp->if_index, NULL);
1321 return IP_CHECK_IF_OURS;
1322 }
1323
1324 /*
1325 * Check for a match in the hash bucket.
1326 */
1327 lck_rw_lock_shared(in_ifaddr_rwlock);
1328 TAILQ_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) {
1329 if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr) {
1330 best_ia = ia;
1331 match_ifp = best_ia->ia_ifp;
1332
1333 if (ia->ia_ifp == inifp || (inifp->if_flags & IFF_LOOPBACK) ||
1334 (m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
1335 /*
1336 * A locally originated packet or packet from the loopback
1337 * interface is always an exact interface address match
1338 */
1339 match_ifp = inifp;
1340 break;
1341 }
1342 /*
1343 * Continue the loop in case there's a exact match with another
1344 * interface
1345 */
1346 }
1347 }
1348 if (best_ia != NULL) {
1349 if (match_ifp != inifp && ipforwarding == 0 &&
1350 ((ip_checkinterface == IP_CHECKINTERFACE_HYBRID_ES &&
1351 (match_ifp->if_family == IFNET_FAMILY_IPSEC ||
1352 match_ifp->if_family == IFNET_FAMILY_UTUN)) ||
1353 ip_checkinterface == IP_CHECKINTERFACE_STRONG_ES)) {
1354 /*
1355 * Drop when interface address check is strict and forwarding
1356 * is disabled
1357 */
1358 result = IP_CHECK_IF_DROP;
1359 } else {
1360 result = IP_CHECK_IF_OURS;
1361 ip_input_setdst_chain(m, 0, best_ia);
1362 }
1363 }
1364 lck_rw_done(in_ifaddr_rwlock);
1365
1366 if (result == IP_CHECK_IF_NONE && (inifp->if_flags & IFF_BROADCAST)) {
1367 /*
1368 * Check for broadcast addresses.
1369 *
1370 * Only accept broadcast packets that arrive via the matching
1371 * interface. Reception of forwarded directed broadcasts would be
1372 * handled via ip_forward() and ether_frameout() with the loopback
1373 * into the stack for SIMPLEX interfaces handled by ether_frameout().
1374 */
1375 struct ifaddr *ifa;
1376
1377 ifnet_lock_shared(inifp);
1378 TAILQ_FOREACH(ifa, &inifp->if_addrhead, ifa_link) {
1379 if (ifa->ifa_addr->sa_family != AF_INET) {
1380 continue;
1381 }
1382 ia = ifatoia(ifa);
1383 if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == ip->ip_dst.s_addr ||
1384 ia->ia_netbroadcast.s_addr == ip->ip_dst.s_addr) {
1385 ip_input_setdst_chain(m, 0, ia);
1386 result = IP_CHECK_IF_OURS;
1387 match_ifp = inifp;
1388 break;
1389 }
1390 }
1391 ifnet_lock_done(inifp);
1392 }
1393
1394 /* Allow DHCP/BootP responses through */
1395 if (result == IP_CHECK_IF_NONE && (inifp->if_eflags & IFEF_AUTOCONFIGURING) &&
1396 ip->ip_p == IPPROTO_UDP && (IP_VHL_HL(ip->ip_vhl) << 2) == sizeof(struct ip)) {
1397 struct udpiphdr *ui;
1398
1399 if (m->m_len < sizeof(struct udpiphdr)) {
1400 if ((m = m_pullup(m, sizeof(struct udpiphdr))) == NULL) {
1401 OSAddAtomic(1, &udpstat.udps_hdrops);
1402 *mp = NULL;
1403 return IP_CHECK_IF_DROP;
1404 }
1405 /*
1406 * m_pullup can return a different mbuf
1407 */
1408 *mp = m;
1409 ip = mtod(m, struct ip *);
1410 }
1411 ui = mtod(m, struct udpiphdr *);
1412 if (ntohs(ui->ui_dport) == IPPORT_BOOTPC) {
1413 ASSERT(m->m_nextpkt == NULL);
1414 ip_setdstifaddr_info(m, inifp->if_index, NULL);
1415 result = IP_CHECK_IF_OURS;
1416 match_ifp = inifp;
1417 }
1418 }
1419
1420 if (result == IP_CHECK_IF_NONE) {
1421 if (ipforwarding == 0) {
1422 result = IP_CHECK_IF_DROP;
1423 } else {
1424 result = IP_CHECK_IF_FORWARD;
1425 ip_input_setdst_chain(m, inifp->if_index, NULL);
1426 }
1427 }
1428
1429 if (result == IP_CHECK_IF_OURS && match_ifp != inifp) {
1430 ipstat.ips_rcv_if_weak_match++;
1431
1432 /* Logging is too noisy when forwarding is enabled */
1433 if (ip_checkinterface_debug != 0 && ipforwarding == 0) {
1434 char src_str[MAX_IPv4_STR_LEN];
1435 char dst_str[MAX_IPv4_STR_LEN];
1436
1437 inet_ntop(AF_INET, &ip->ip_src, src_str, sizeof(src_str));
1438 inet_ntop(AF_INET, &ip->ip_dst, dst_str, sizeof(dst_str));
1439 os_log_info(OS_LOG_DEFAULT,
1440 "%s: weak ES interface match to %s for packet from %s to %s proto %u received via %s",
1441 __func__, best_ia->ia_ifp->if_xname, src_str, dst_str, ip->ip_p, inifp->if_xname);
1442 }
1443 } else if (result == IP_CHECK_IF_DROP) {
1444 if (ip_checkinterface_debug > 0) {
1445 char src_str[MAX_IPv4_STR_LEN];
1446 char dst_str[MAX_IPv4_STR_LEN];
1447
1448 inet_ntop(AF_INET, &ip->ip_src, src_str, sizeof(src_str));
1449 inet_ntop(AF_INET, &ip->ip_dst, dst_str, sizeof(dst_str));
f427ee49 1450 os_log(OS_LOG_DEFAULT,
eb6b6ca3
A
1451 "%s: no interface match for packet from %s to %s proto %u received via %s",
1452 __func__, src_str, dst_str, ip->ip_p, inifp->if_xname);
1453 }
1454 struct mbuf *tmp_mbuf = m;
1455 while (tmp_mbuf != NULL) {
1456 ipstat.ips_rcv_if_no_match++;
1457 tmp_mbuf = tmp_mbuf->m_nextpkt;
1458 }
1459 m_freem_list(m);
1460 *mp = NULL;
1461 }
1462
1463 return result;
1464}
1465
3e170ce0 1466static void
f427ee49
A
1467ip_input_second_pass(struct mbuf *m, struct ifnet *inifp,
1468 int npkts_in_chain, int bytes_in_chain, struct ip_fw_in_args *args)
3e170ce0 1469{
0a7de745 1470 struct mbuf *tmp_mbuf = NULL;
0a7de745 1471 unsigned int hlen;
3e170ce0 1472
3e170ce0 1473#pragma unused (args)
3e170ce0
A
1474
1475 struct ip *ip = mtod(m, struct ip *);
1476 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1477
1478 OSAddAtomic(npkts_in_chain, &ipstat.ips_total);
1479
1480 /*
1481 * Naively assume we can attribute inbound data to the route we would
1482 * use to send to this destination. Asymmetric routing breaks this
1483 * assumption, but it still allows us to account for traffic from
1484 * a remote node in the routing table.
1485 * this has a very significant performance impact so we bypass
1486 * if nstat_collect is disabled. We may also bypass if the
1487 * protocol is tcp in the future because tcp will have a route that
1488 * we can use to attribute the data to. That does mean we would not
1489 * account for forwarded tcp traffic.
1490 */
1491 ip_input_update_nstat(inifp, ip->ip_src, npkts_in_chain,
1492 bytes_in_chain);
1493
3e170ce0
A
1494 /*
1495 * Check our list of addresses, to see if the packet is for us.
1496 * If we don't have any addresses, assume any unicast packet
1497 * we receive might be for us (and let the upper layers deal
1498 * with it).
1499 */
1500 tmp_mbuf = m;
1501 if (TAILQ_EMPTY(&in_ifaddrhead)) {
eb6b6ca3 1502 while (tmp_mbuf != NULL) {
0a7de745 1503 if (!(tmp_mbuf->m_flags & (M_MCAST | M_BCAST))) {
3e170ce0
A
1504 ip_setdstifaddr_info(tmp_mbuf, inifp->if_index,
1505 NULL);
1506 }
1507 tmp_mbuf = mbuf_nextpkt(tmp_mbuf);
1508 }
1509 goto ours;
1510 }
3e170ce0
A
1511
1512 /*
1513 * Enable a consistency check between the destination address
1514 * and the arrival interface for a unicast packet (the RFC 1122
1515 * strong ES model) if IP forwarding is disabled and the packet
f427ee49 1516 * is not locally generated
3e170ce0
A
1517 *
1518 * XXX - Checking also should be disabled if the destination
1519 * address is ipnat'ed to a different interface.
1520 *
1521 * XXX - Checking is incompatible with IP aliases added
1522 * to the loopback interface instead of the interface where
1523 * the packets are received.
1524 */
eb6b6ca3
A
1525 if (!IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
1526 ip_check_if_result_t ip_check_if_result = IP_CHECK_IF_NONE;
3e170ce0 1527
eb6b6ca3
A
1528 ip_check_if_result = ip_input_check_interface(&m, ip, inifp);
1529 ASSERT(ip_check_if_result != IP_CHECK_IF_NONE);
1530 if (ip_check_if_result == IP_CHECK_IF_OURS) {
3e170ce0 1531 goto ours;
eb6b6ca3
A
1532 } else if (ip_check_if_result == IP_CHECK_IF_DROP) {
1533 return;
3e170ce0 1534 }
eb6b6ca3 1535 } else {
3e170ce0
A
1536 struct in_multi *inm;
1537 /*
1538 * See if we belong to the destination multicast group on the
1539 * arrival interface.
1540 */
1541 in_multihead_lock_shared();
1542 IN_LOOKUP_MULTI(&ip->ip_dst, inifp, inm);
1543 in_multihead_lock_done();
1544 if (inm == NULL) {
1545 OSAddAtomic(npkts_in_chain, &ipstat.ips_notmember);
1546 m_freem_list(m);
1547 KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1548 return;
1549 }
1550 ip_input_setdst_chain(m, inifp->if_index, NULL);
1551 INM_REMREF(inm);
1552 goto ours;
1553 }
1554
3e170ce0
A
1555 tmp_mbuf = m;
1556 struct mbuf *nxt_mbuf = NULL;
eb6b6ca3 1557 while (tmp_mbuf != NULL) {
3e170ce0
A
1558 nxt_mbuf = mbuf_nextpkt(tmp_mbuf);
1559 /*
1560 * Not for us; forward if possible and desirable.
1561 */
1562 mbuf_setnextpkt(tmp_mbuf, NULL);
1563 if (ipforwarding == 0) {
1564 OSAddAtomic(1, &ipstat.ips_cantforward);
1565 m_freem(tmp_mbuf);
1566 } else {
3e170ce0 1567 ip_forward(tmp_mbuf, 0, NULL);
3e170ce0
A
1568 }
1569 tmp_mbuf = nxt_mbuf;
1570 }
1571 KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1572 return;
1573ours:
eb6b6ca3 1574 ip = mtod(m, struct ip *); /* in case it changed */
3e170ce0 1575 /*
f427ee49 1576 * If offset is set, must reassemble.
3e170ce0
A
1577 */
1578 if (ip->ip_off & ~(IP_DF | IP_RF)) {
1579 VERIFY(npkts_in_chain == 1);
3e170ce0 1580 m = ip_reass(m);
0a7de745 1581 if (m == NULL) {
3e170ce0 1582 return;
0a7de745 1583 }
3e170ce0
A
1584 ip = mtod(m, struct ip *);
1585 /* Get the header length of the reassembled packet */
1586 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
3e170ce0
A
1587 }
1588
1589 /*
1590 * Further protocols expect the packet length to be w/o the
1591 * IP header.
1592 */
1593 ip->ip_len -= hlen;
1594
3e170ce0 1595#if IPSEC
39236c6e 1596 /*
3e170ce0
A
1597 * enforce IPsec policy checking if we are seeing last header.
1598 * note that we do not visit this with protocols with pcb layer
1599 * code - like udp/tcp/raw ip.
39236c6e 1600 */
3e170ce0
A
1601 if (ipsec_bypass == 0 && (ip_protox[ip->ip_p]->pr_flags & PR_LASTHDR)) {
1602 VERIFY(npkts_in_chain == 1);
1603 if (ipsec4_in_reject(m, NULL)) {
1604 IPSEC_STAT_INCREMENT(ipsecstat.in_polvio);
1605 goto bad;
91447636 1606 }
39236c6e 1607 }
3e170ce0 1608#endif /* IPSEC */
91447636 1609
3e170ce0
A
1610 /*
1611 * Switch out to protocol's input routine.
1612 */
1613 OSAddAtomic(npkts_in_chain, &ipstat.ips_delivered);
39236c6e 1614
3e170ce0 1615 ip_input_dispatch_chain(m);
b0d623f7 1616
3e170ce0
A
1617 KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1618 return;
1619bad:
1620 KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1621 m_freem(m);
b0d623f7
A
1622}
1623
316670eb 1624void
3e170ce0 1625ip_input_process_list(struct mbuf *packet_list)
91447636 1626{
0a7de745 1627 pktchain_elm_t pktchain_tbl[PKTTBL_SZ];
3e170ce0 1628
0a7de745
A
1629 struct mbuf *packet = NULL;
1630 struct mbuf *modm = NULL; /* modified mbuf */
1631 int retval = 0;
39037602 1632#if (DEBUG || DEVELOPMENT)
3e170ce0 1633 struct timeval start_tv;
39037602 1634#endif /* (DEBUG || DEVELOPMENT) */
0a7de745 1635 int num_pkts = 0;
3e170ce0
A
1636 int chain = 0;
1637 struct ip_fw_in_args args;
1638
1639 if (ip_chaining == 0) {
1640 struct mbuf *m = packet_list;
39037602 1641#if (DEBUG || DEVELOPMENT)
0a7de745 1642 if (ip_input_measure) {
3e170ce0 1643 net_perf_start_time(&net_perf, &start_tv);
0a7de745 1644 }
39037602
A
1645#endif /* (DEBUG || DEVELOPMENT) */
1646
3e170ce0
A
1647 while (m) {
1648 packet_list = mbuf_nextpkt(m);
1649 mbuf_setnextpkt(m, NULL);
1650 ip_input(m);
1651 m = packet_list;
1652 num_pkts++;
1653 }
39037602 1654#if (DEBUG || DEVELOPMENT)
0a7de745 1655 if (ip_input_measure) {
3e170ce0 1656 net_perf_measure_time(&net_perf, &start_tv, num_pkts);
0a7de745 1657 }
39037602 1658#endif /* (DEBUG || DEVELOPMENT) */
3e170ce0
A
1659 return;
1660 }
39037602 1661#if (DEBUG || DEVELOPMENT)
0a7de745 1662 if (ip_input_measure) {
3e170ce0 1663 net_perf_start_time(&net_perf, &start_tv);
0a7de745 1664 }
39037602 1665#endif /* (DEBUG || DEVELOPMENT) */
3e170ce0
A
1666
1667 bzero(&pktchain_tbl, sizeof(pktchain_tbl));
1668restart_list_process:
1669 chain = 0;
1670 for (packet = packet_list; packet; packet = packet_list) {
1671 packet_list = mbuf_nextpkt(packet);
1672 mbuf_setnextpkt(packet, NULL);
1673
1674 num_pkts++;
1675 modm = NULL;
0a7de745 1676 bzero(&args, sizeof(args));
3e170ce0 1677
f427ee49 1678 retval = ip_input_first_pass(packet, &args, &modm);
3e170ce0
A
1679
1680 if (retval == IPINPUT_DOCHAIN) {
0a7de745 1681 if (modm) {
3e170ce0 1682 packet = modm;
0a7de745 1683 }
3e170ce0
A
1684 packet = ip_chain_insert(packet, &pktchain_tbl[0]);
1685 if (packet == NULL) {
1686 ipstat.ips_rxc_chained++;
1687 chain++;
0a7de745 1688 if (chain > ip_chainsz) {
3e170ce0 1689 break;
0a7de745 1690 }
3e170ce0
A
1691 } else {
1692 ipstat.ips_rxc_collisions++;
1693 break;
316670eb 1694 }
3e170ce0
A
1695 } else if (retval == IPINPUT_DONTCHAIN) {
1696 /* in order to preserve order, exit from chaining */
0a7de745 1697 if (modm) {
3e170ce0 1698 packet = modm;
0a7de745 1699 }
3e170ce0
A
1700 ipstat.ips_rxc_notchain++;
1701 break;
1702 } else {
1703 /* packet was freed or delivered, do nothing. */
91447636 1704 }
91447636 1705 }
316670eb 1706
3e170ce0 1707 /* do second pass here for pktchain_tbl */
0a7de745 1708 if (chain) {
3e170ce0 1709 ip_input_second_pass_loop_tbl(&pktchain_tbl[0], &args);
0a7de745 1710 }
316670eb 1711
3e170ce0
A
1712 if (packet) {
1713 /*
1714 * The equivalent update in the chaining case is performed in
1715 * ip_input_second_pass_loop_tbl().
1716 */
39037602 1717#if (DEBUG || DEVELOPMENT)
0a7de745 1718 if (ip_input_measure) {
3e170ce0 1719 net_perf_histogram(&net_perf, 1);
0a7de745 1720 }
39037602 1721#endif /* (DEBUG || DEVELOPMENT) */
f427ee49
A
1722 ip_input_second_pass(packet, packet->m_pkthdr.rcvif,
1723 1, packet->m_pkthdr.len, &args);
91447636 1724 }
0b4c1975 1725
0a7de745 1726 if (packet_list) {
3e170ce0 1727 goto restart_list_process;
0a7de745 1728 }
91447636 1729
39037602 1730#if (DEBUG || DEVELOPMENT)
0a7de745 1731 if (ip_input_measure) {
3e170ce0 1732 net_perf_measure_time(&net_perf, &start_tv, num_pkts);
0a7de745 1733 }
39037602 1734#endif /* (DEBUG || DEVELOPMENT) */
3e170ce0 1735}
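/*
 * [Editor's note: illustrative sketch, not part of the original source.]
 * ip_input_process_list() above either falls back to one ip_input() call
 * per packet (ip_chaining == 0) or runs a two-pass scheme: a first pass
 * per packet (validation and classification) that inserts chainable
 * packets into a small table of chains (pktchain_tbl), and a second pass
 * that hands each accumulated chain to delivery in one call.  The sketch
 * below shows only that bucket-then-flush shape; struct pkt, NBUCKETS,
 * chain_insert() and second_pass() are invented names, and arrival-order
 * preservation is omitted for brevity.
 */
#if 0 /* standalone sketch; builds in user space, not in the kernel */
#include <stddef.h>

#define NBUCKETS 16

struct pkt {
	struct pkt *next;
	unsigned    flow;       /* assumed classification key */
};

static struct pkt *buckets[NBUCKETS];

static void
chain_insert(struct pkt *p)
{
	unsigned b = p->flow % NBUCKETS;
	p->next = buckets[b];   /* prepend; the real code keeps order */
	buckets[b] = p;
}

static void
second_pass(void (*deliver_chain)(struct pkt *))
{
	for (unsigned b = 0; b < NBUCKETS; b++) {
		if (buckets[b] != NULL) {
			deliver_chain(buckets[b]);  /* one call per chain */
			buckets[b] = NULL;
		}
	}
}
#endif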
1c79356b
A
1736/*
1737 * IP input routine. Checksum and byte-swap the header. If fragmented,
1738 * try to reassemble. Process options. Pass to the next level.
1739 */
1740void
1741ip_input(struct mbuf *m)
1742{
1743 struct ip *ip;
eb6b6ca3 1744 unsigned int hlen;
316670eb 1745 u_short sum = 0;
eb6b6ca3 1746#if DUMMYNET
91447636 1747 struct ip_fw_args args;
0a7de745 1748 struct m_tag *tag;
4a3eedf9 1749#endif
39236c6e
A
1750 ipfilter_t inject_filter_ref = NULL;
1751 struct ifnet *inifp;
b0d623f7 1752
6d2010ae
A
1753 /* Check if the mbuf is still valid after interface filter processing */
1754 MBUF_INPUT_CHECK(m, m->m_pkthdr.rcvif);
39236c6e
A
1755 inifp = m->m_pkthdr.rcvif;
1756 VERIFY(inifp != NULL);
6d2010ae 1757
3e170ce0
A
1758 ipstat.ips_rxc_notlist++;
1759
316670eb 1760 /* Perform IP header alignment fixup, if needed */
39236c6e
A
1761 IP_HDR_ALIGNMENT_FIXUP(m, inifp, goto bad);
1762
1763 m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
316670eb 1764
f427ee49 1765#if DUMMYNET
0a7de745 1766 bzero(&args, sizeof(struct ip_fw_args));
91447636 1767
b0d623f7
A
1768 /*
1769 * Don't bother searching for tag(s) if there's none.
1770 */
0a7de745 1771 if (SLIST_EMPTY(&m->m_pkthdr.tags)) {
b0d623f7 1772 goto ipfw_tags_done;
0a7de745 1773 }
b0d623f7 1774
91447636 1775 /* Grab info from mtags prepended to the chain */
b0d623f7
A
1776 if ((tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID,
1777 KERNEL_TAG_TYPE_DUMMYNET, NULL)) != NULL) {
39236c6e 1778 struct dn_pkt_tag *dn_tag;
b0d623f7 1779
0a7de745 1780 dn_tag = (struct dn_pkt_tag *)(tag + 1);
316670eb 1781 args.fwa_pf_rule = dn_tag->dn_pf_rule;
b0d623f7 1782
91447636
A
1783 m_tag_delete(m, tag);
1784 }
b0d623f7 1785
eb6b6ca3 1786#if DIAGNOSTIC
0a7de745 1787 if (m == NULL || !(m->m_flags & M_PKTHDR)) {
1c79356b 1788 panic("ip_input no HDR");
0a7de745 1789 }
1c79356b 1790#endif
91447636 1791
f427ee49 1792 if (args.fwa_pf_rule) {
39236c6e 1793 /* dummynet already filtered us */
b0d623f7
A
1794 ip = mtod(m, struct ip *);
1795 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1796 inject_filter_ref = ipf_get_inject_filter(m);
0a7de745 1797 if (args.fwa_pf_rule) {
316670eb 1798 goto check_with_pf;
0a7de745 1799 }
91447636 1800 }
b0d623f7 1801ipfw_tags_done:
f427ee49 1802#endif /* DUMMYNET */
b0d623f7 1803
91447636 1804 /*
316670eb 1805 * No need to process packet twice if we've already seen it.
91447636 1806 */
0a7de745 1807 if (!SLIST_EMPTY(&m->m_pkthdr.tags)) {
b0d623f7 1808 inject_filter_ref = ipf_get_inject_filter(m);
0a7de745 1809 }
39236c6e 1810 if (inject_filter_ref != NULL) {
91447636
A
1811 ip = mtod(m, struct ip *);
1812 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
6d2010ae 1813
39236c6e
A
1814 DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL,
1815 struct ip *, ip, struct ifnet *, inifp,
1816 struct ip *, ip, struct ip6_hdr *, NULL);
1817
91447636
A
1818 ip->ip_len = ntohs(ip->ip_len) - hlen;
1819 ip->ip_off = ntohs(ip->ip_off);
1820 ip_proto_dispatch_in(m, hlen, ip->ip_p, inject_filter_ref);
1821 return;
1822 }
1823
b0d623f7 1824 OSAddAtomic(1, &ipstat.ips_total);
0a7de745 1825 if (m->m_pkthdr.len < sizeof(struct ip)) {
1c79356b 1826 goto tooshort;
0a7de745 1827 }
1c79356b 1828
0a7de745
A
1829 if (m->m_len < sizeof(struct ip) &&
1830 (m = m_pullup(m, sizeof(struct ip))) == NULL) {
b0d623f7 1831 OSAddAtomic(1, &ipstat.ips_toosmall);
1c79356b
A
1832 return;
1833 }
1834 ip = mtod(m, struct ip *);
1835
39236c6e
A
1836 KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr, ip->ip_src.s_addr,
1837 ip->ip_p, ip->ip_off, ip->ip_len);
1c79356b
A
1838
1839 if (IP_VHL_V(ip->ip_vhl) != IPVERSION) {
b0d623f7 1840 OSAddAtomic(1, &ipstat.ips_badvers);
1c79356b
A
1841 goto bad;
1842 }
1843
1844 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
0a7de745 1845 if (hlen < sizeof(struct ip)) { /* minimum header length */
b0d623f7 1846 OSAddAtomic(1, &ipstat.ips_badhlen);
1c79356b
A
1847 goto bad;
1848 }
1849 if (hlen > m->m_len) {
39236c6e 1850 if ((m = m_pullup(m, hlen)) == NULL) {
b0d623f7 1851 OSAddAtomic(1, &ipstat.ips_badhlen);
1c79356b
A
1852 return;
1853 }
1854 ip = mtod(m, struct ip *);
1855 }
1856
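/*
 * [Editor's note: illustrative sketch, not part of the original source.]
 * ip_vhl packs the IP version in the high nibble and the header length,
 * counted in 32-bit words, in the low nibble, so IP_VHL_HL(vhl) << 2
 * (used throughout this file) yields the header length in bytes.  The
 * SKETCH_* macros below mirror the usual mask/shift definitions and are
 * invented names used only for this example.
 */
#if 0 /* standalone sketch; builds in user space, not in the kernel */
#include <assert.h>

#define SKETCH_VHL_V(vhl)   ((vhl) >> 4)    /* version nibble          */
#define SKETCH_VHL_HL(vhl)  ((vhl) & 0x0f)  /* header length in words  */

static void
vhl_examples(void)
{
	/* 0x45: version 4, 5 words -> 20-byte header (no options)   */
	assert(SKETCH_VHL_V(0x45) == 4 && (SKETCH_VHL_HL(0x45) << 2) == 20);
	/* 0x46: version 4, 6 words -> 24 bytes (one 4-byte option)  */
	assert((SKETCH_VHL_HL(0x46) << 2) == 24);
	/* 0x4f: maximum header, 15 words -> 60 bytes                */
	assert((SKETCH_VHL_HL(0x4f) << 2) == 60);
}
#endif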
9bccf70c
A
1857 /* 127/8 must not appear on wire - RFC1122 */
1858 if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
1859 (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
39236c6e
A
1860 /*
1861 * Allow for the following exceptions:
1862 *
1863 * 1. If the packet was sent to loopback (i.e. rcvif
1864 * would have been set earlier at output time.)
1865 *
1866 * 2. If the packet was sent out on loopback from a local
1867 * source address which belongs to a non-loopback
1868 * interface (i.e. rcvif may not necessarily be a
1869 * loopback interface, hence the test for PKTF_LOOP.)
1870 * Unlike IPv6, there is no interface scope ID, and
1871 * therefore we don't care so much about PKTF_IFINFO.
1872 */
1873 if (!(inifp->if_flags & IFF_LOOPBACK) &&
1874 !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
b0d623f7 1875 OSAddAtomic(1, &ipstat.ips_badaddr);
9bccf70c
A
1876 goto bad;
1877 }
1878 }
1879
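/*
 * [Editor's note: illustrative worked example, not part of the original
 * source.]  The RFC 1122 check above compares the top octet of each
 * address with the loopback network.  With the usual definitions
 * (IN_CLASSA_NSHIFT == 24, IN_LOOPBACKNET == 127), any 127.x.y.z
 * address matches, e.g.:
 *
 *     ntohl(127.0.0.1)   == 0x7f000001,  0x7f000001 >> 24 == 127
 *     ntohl(127.200.3.4) == 0x7fc80304,  0x7fc80304 >> 24 == 127
 *
 * so such packets are rejected unless they arrived over a loopback
 * interface or carry PKTF_LOOP.
 */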
39236c6e
A
1880 /* IPv4 Link-Local Addresses as defined in RFC3927 */
1881 if ((IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) ||
9bccf70c
A
1882 IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)))) {
1883 ip_linklocal_stat.iplls_in_total++;
1884 if (ip->ip_ttl != MAXTTL) {
b0d623f7 1885 OSAddAtomic(1, &ip_linklocal_stat.iplls_in_badttl);
9bccf70c 1886 /* Silently drop link local traffic with bad TTL */
0a7de745 1887 if (!ip_linklocal_in_allowbadttl) {
9bccf70c 1888 goto bad;
0a7de745 1889 }
9bccf70c
A
1890 }
1891 }
1c79356b 1892
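/*
 * [Editor's note: illustrative worked example, not part of the original
 * source.]  IN_LINKLOCAL() matches the IPv4 link-local prefix
 * 169.254.0.0/16 (RFC 3927), e.g.:
 *
 *     ntohl(169.254.10.20) == 0xa9fe0a14
 *     0xa9fe0a14 & 0xffff0000 == 0xa9fe0000  -> link-local
 *
 * and the check above expects such packets to arrive with
 * ip_ttl == MAXTTL (255); anything else counts as a bad TTL and is
 * silently dropped unless ip_linklocal_in_allowbadttl is set.
 */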
316670eb 1893 sum = ip_cksum(m, hlen);
1c79356b 1894 if (sum) {
1c79356b
A
1895 goto bad;
1896 }
1897
39236c6e
A
1898 DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL,
1899 struct ip *, ip, struct ifnet *, inifp,
1900 struct ip *, ip, struct ip6_hdr *, NULL);
6d2010ae
A
1901
1902 /*
1903 * Naively assume we can attribute inbound data to the route we would
3e170ce0 1904 * use to send to this destination. Asymmetric routing breaks this
6d2010ae
A
1905 * assumption, but it still allows us to account for traffic from
1906 * a remote node in the routing table.
1907 * This has a very significant performance impact, so we bypass it
1908 * if nstat_collect is disabled. We may also bypass it in the future
1909 * if the protocol is TCP, because TCP will have a route that
1910 * we can use to attribute the data to. That does mean we would not
1911 * account for forwarded TCP traffic.
1912 */
1913 if (nstat_collect) {
1914 struct rtentry *rt =
39236c6e 1915 ifnet_cached_rtlookup_inet(inifp, ip->ip_src);
6d2010ae
A
1916 if (rt != NULL) {
1917 nstat_route_rx(rt, 1, m->m_pkthdr.len, 0);
1918 rtfree(rt);
1919 }
1920 }
1921
1c79356b
A
1922 /*
1923 * Convert fields to host representation.
1924 */
b0d623f7 1925#if BYTE_ORDER != BIG_ENDIAN
1c79356b 1926 NTOHS(ip->ip_len);
b0d623f7 1927#endif
39236c6e 1928
1c79356b 1929 if (ip->ip_len < hlen) {
b0d623f7 1930 OSAddAtomic(1, &ipstat.ips_badlen);
1c79356b
A
1931 goto bad;
1932 }
1c79356b 1933
b0d623f7
A
1934#if BYTE_ORDER != BIG_ENDIAN
1935 NTOHS(ip->ip_off);
1936#endif
1c79356b
A
1937 /*
1938 * Check that the amount of data in the buffers
1939 * is at least as much as the IP header would have us expect.
1940 * Trim mbufs if longer than we expect.
1941 * Drop packet if shorter than we expect.
1942 */
1943 if (m->m_pkthdr.len < ip->ip_len) {
1944tooshort:
b0d623f7 1945 OSAddAtomic(1, &ipstat.ips_tooshort);
1c79356b
A
1946 goto bad;
1947 }
1948 if (m->m_pkthdr.len > ip->ip_len) {
5ba3f43e 1949 ip_input_adjust(m, ip, inifp);
1c79356b 1950 }
9bccf70c 1951
316670eb
A
1952#if DUMMYNET
1953check_with_pf:
1954#endif
b0d623f7
A
1955#if PF
1956 /* Invoke inbound packet filter */
316670eb 1957 if (PF_IS_ENABLED) {
6d2010ae 1958 int error;
316670eb 1959#if DUMMYNET
39236c6e 1960 error = pf_af_hook(inifp, NULL, &m, AF_INET, TRUE, &args);
316670eb 1961#else
39236c6e 1962 error = pf_af_hook(inifp, NULL, &m, AF_INET, TRUE, NULL);
316670eb
A
1963#endif /* DUMMYNET */
1964 if (error != 0 || m == NULL) {
6d2010ae 1965 if (m != NULL) {
39236c6e
A
1966 panic("%s: unexpected packet %p\n",
1967 __func__, m);
6d2010ae
A
1968 /* NOTREACHED */
1969 }
1970 /* Already freed by callee */
1971 return;
316670eb 1972 }
6d2010ae
A
1973 ip = mtod(m, struct ip *);
1974 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
b0d623f7 1975 }
b0d623f7 1976#endif /* PF */
1c79356b 1977
6d2010ae 1978#if IPSEC
0a7de745 1979 if (ipsec_bypass == 0 && ipsec_gethist(m, NULL)) {
6d2010ae 1980 goto pass;
0a7de745 1981 }
6d2010ae
A
1982#endif
1983
9bccf70c 1984pass:
1c79356b
A
1985 /*
1986 * Process options and, if not destined for us,
1987 * ship it on. ip_dooptions returns 1 when an
1988 * error was detected (causing an icmp message
1989 * to be sent and the original packet to be freed).
1990 */
0a7de745 1991 ip_nhops = 0; /* for source routed packets */
0a7de745 1992 if (hlen > sizeof(struct ip) && ip_dooptions(m, 0, NULL)) {
1c79356b
A
1993 return;
1994 }
1995
1c79356b
A
1996 /*
1997 * Check our list of addresses, to see if the packet is for us.
9bccf70c
A
1998 * If we don't have any addresses, assume any unicast packet
1999 * we receive might be for us (and let the upper layers deal
2000 * with it).
1c79356b 2001 */
0a7de745 2002 if (TAILQ_EMPTY(&in_ifaddrhead) && !(m->m_flags & (M_MCAST | M_BCAST))) {
39236c6e 2003 ip_setdstifaddr_info(m, inifp->if_index, NULL);
9bccf70c 2004 goto ours;
39236c6e 2005 }
1c79356b 2006
9bccf70c
A
2007 /*
2008 * Enable a consistency check between the destination address
2009 * and the arrival interface for a unicast packet (the RFC 1122
2010 * strong ES model) if IP forwarding is disabled and the packet
2011 * is not locally generated and the packet is not subject to
2012 * 'ipfw fwd'.
2013 *
2014 * XXX - Checking also should be disabled if the destination
2015 * address is ipnat'ed to a different interface.
2016 *
2017 * XXX - Checking is incompatible with IP aliases added
2018 * to the loopback interface instead of the interface where
2019 * the packets are received.
2020 */
eb6b6ca3
A
2021 if (!IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
2022 ip_check_if_result_t check_if_result = IP_CHECK_IF_NONE;
9bccf70c 2023
eb6b6ca3
A
2024 check_if_result = ip_input_check_interface(&m, ip, inifp);
2025 ASSERT(check_if_result != IP_CHECK_IF_NONE);
2026 if (check_if_result == IP_CHECK_IF_OURS) {
1c79356b 2027 goto ours;
eb6b6ca3
A
2028 } else if (check_if_result == IP_CHECK_IF_DROP) {
2029 return;
91447636 2030 }
eb6b6ca3 2031 } else {
1c79356b 2032 struct in_multi *inm;
1c79356b
A
2033 /*
2034 * See if we belong to the destination multicast group on the
2035 * arrival interface.
2036 */
6d2010ae 2037 in_multihead_lock_shared();
39236c6e 2038 IN_LOOKUP_MULTI(&ip->ip_dst, inifp, inm);
6d2010ae 2039 in_multihead_lock_done();
1c79356b 2040 if (inm == NULL) {
b0d623f7 2041 OSAddAtomic(1, &ipstat.ips_notmember);
1c79356b
A
2042 m_freem(m);
2043 return;
2044 }
39236c6e 2045 ip_setdstifaddr_info(m, inifp->if_index, NULL);
6d2010ae 2046 INM_REMREF(inm);
1c79356b
A
2047 goto ours;
2048 }
0b4e3aa0 2049
1c79356b
A
2050 /*
2051 * Not for us; forward if possible and desirable.
2052 */
2053 if (ipforwarding == 0) {
b0d623f7 2054 OSAddAtomic(1, &ipstat.ips_cantforward);
1c79356b 2055 m_freem(m);
91447636 2056 } else {
b0d623f7 2057 ip_forward(m, 0, NULL);
91447636 2058 }
1c79356b
A
2059 return;
2060
2061ours:
1c79356b
A
2062 /*
2063 * If offset or IP_MF is set, must reassemble.
1c79356b 2064 */
39236c6e 2065 if (ip->ip_off & ~(IP_DF | IP_RF)) {
39236c6e 2066 m = ip_reass(m);
0a7de745 2067 if (m == NULL) {
39236c6e 2068 return;
0a7de745 2069 }
39236c6e
A
2070 ip = mtod(m, struct ip *);
2071 /* Get the header length of the reassembled packet */
2072 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
39236c6e
A
2073 }
2074
2075 /*
2076 * Further protocols expect the packet length to be w/o the
2077 * IP header.
2078 */
2079 ip->ip_len -= hlen;
1c79356b 2080
1c79356b 2081
9bccf70c
A
2082#if IPSEC
2083 /*
2084 * Enforce IPsec policy checking if we are seeing the last header.
2085 * Note that we do not visit this for protocols with PCB-layer
2086 * code, such as UDP/TCP/raw IP.
2087 */
39236c6e 2088 if (ipsec_bypass == 0 && (ip_protox[ip->ip_p]->pr_flags & PR_LASTHDR)) {
91447636 2089 if (ipsec4_in_reject(m, NULL)) {
2d21ac55 2090 IPSEC_STAT_INCREMENT(ipsecstat.in_polvio);
39236c6e 2091 goto bad;
91447636 2092 }
1c79356b 2093 }
39236c6e 2094#endif /* IPSEC */
1c79356b
A
2095
2096 /*
2097 * Switch out to protocol's input routine.
2098 */
b0d623f7 2099 OSAddAtomic(1, &ipstat.ips_delivered);
39236c6e 2100
39236c6e 2101 ip_proto_dispatch_in(m, hlen, ip->ip_p, 0);
39236c6e
A
2102 return;
2103
1c79356b 2104bad:
39236c6e 2105 KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1c79356b
A
2106 m_freem(m);
2107}
2108
39236c6e
A
2109static void
2110ipq_updateparams(void)
2111{
5ba3f43e 2112 LCK_MTX_ASSERT(&ipqlock, LCK_MTX_ASSERT_OWNED);
39236c6e
A
2113 /*
2114 * -1 for unlimited allocation.
2115 */
0a7de745 2116 if (maxnipq < 0) {
39236c6e 2117 ipq_limit = 0;
0a7de745 2118 }
39236c6e
A
2119 /*
2120 * Positive number for specific bound.
2121 */
0a7de745 2122 if (maxnipq > 0) {
39236c6e 2123 ipq_limit = maxnipq;
0a7de745 2124 }
39236c6e
A
2125 /*
2126 * Zero specifies no further fragment queue allocation -- set the
2127 * bound very low, but rely on implementation elsewhere to actually
2128 * prevent allocation and reclaim current queues.
2129 */
0a7de745 2130 if (maxnipq == 0) {
39236c6e 2131 ipq_limit = 1;
0a7de745 2132 }
39236c6e
A
2133 /*
2134 * Arm the purge timer if not already and if there's work to do
2135 */
2136 frag_sched_timeout();
2137}
2138
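/*
 * [Editor's note: illustrative sketch, not part of the original source.]
 * The mapping implemented by ipq_updateparams() above, in isolation:
 * a negative maxnipq means "unlimited" (an ipq_limit of 0 is treated as
 * no bound by ipq_alloc() below), a positive value is used as-is, and
 * zero collapses the bound to 1 while other code prevents new queue
 * allocation entirely.  ipq_limit_for() is an invented name.
 */
#if 0 /* standalone sketch; builds in user space, not in the kernel */
static unsigned int
ipq_limit_for(int maxnipq_value)
{
	if (maxnipq_value < 0) {
		return 0;                       /* unlimited            */
	}
	if (maxnipq_value == 0) {
		return 1;                       /* effectively disabled */
	}
	return (unsigned int)maxnipq_value;     /* specific bound       */
}
#endif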
2139static int
2140sysctl_maxnipq SYSCTL_HANDLER_ARGS
2141{
2142#pragma unused(arg1, arg2)
2143 int error, i;
2144
2145 lck_mtx_lock(&ipqlock);
2146 i = maxnipq;
2147 error = sysctl_handle_int(oidp, &i, 0, req);
0a7de745 2148 if (error || req->newptr == USER_ADDR_NULL) {
39236c6e 2149 goto done;
0a7de745 2150 }
39236c6e
A
2151 /* impose bounds */
2152 if (i < -1 || i > (nmbclusters / 4)) {
2153 error = EINVAL;
2154 goto done;
2155 }
2156 maxnipq = i;
2157 ipq_updateparams();
2158done:
2159 lck_mtx_unlock(&ipqlock);
0a7de745 2160 return error;
39236c6e
A
2161}
2162
2163static int
2164sysctl_maxfragsperpacket SYSCTL_HANDLER_ARGS
2165{
2166#pragma unused(arg1, arg2)
2167 int error, i;
2168
2169 lck_mtx_lock(&ipqlock);
2170 i = maxfragsperpacket;
2171 error = sysctl_handle_int(oidp, &i, 0, req);
0a7de745 2172 if (error || req->newptr == USER_ADDR_NULL) {
39236c6e 2173 goto done;
0a7de745 2174 }
39236c6e 2175 maxfragsperpacket = i;
0a7de745 2176 ipq_updateparams(); /* see if we need to arm timer */
39236c6e
A
2177done:
2178 lck_mtx_unlock(&ipqlock);
0a7de745 2179 return error;
39236c6e
A
2180}
2181
1c79356b 2182/*
9bccf70c
A
2183 * Take incoming datagram fragment and try to reassemble it into
2184 * whole datagram. If a chain for reassembly of this datagram already
2185 * exists, then it is given as fp; otherwise have to make a chain.
2186 *
5ba3f43e 2187 * The IP header is *NOT* adjusted out of iplen (but in host byte order).
1c79356b 2188 */
9bccf70c 2189static struct mbuf *
39236c6e 2190ip_reass(struct mbuf *m)
1c79356b 2191{
39236c6e
A
2192 struct ip *ip;
2193 struct mbuf *p, *q, *nq, *t;
2194 struct ipq *fp = NULL;
2195 struct ipqhead *head;
2196 int i, hlen, next;
2d21ac55 2197 u_int8_t ecn, ecn0;
39236c6e
A
2198 uint32_t csum, csum_flags;
2199 uint16_t hash;
2200 struct fq_head dfq;
2201
0a7de745 2202 MBUFQ_INIT(&dfq); /* for deferred frees */
39236c6e
A
2203
2204 /* If maxnipq or maxfragsperpacket is 0, never accept fragments. */
2205 if (maxnipq == 0 || maxfragsperpacket == 0) {
2206 ipstat.ips_fragments++;
2207 ipstat.ips_fragdropped++;
2208 m_freem(m);
2209 if (nipq > 0) {
2210 lck_mtx_lock(&ipqlock);
0a7de745 2211 frag_sched_timeout(); /* purge stale fragments */
39236c6e
A
2212 lck_mtx_unlock(&ipqlock);
2213 }
0a7de745 2214 return NULL;
39236c6e
A
2215 }
2216
2217 ip = mtod(m, struct ip *);
2218 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
2219
2220 lck_mtx_lock(&ipqlock);
2221
2222 hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id);
2223 head = &ipq[hash];
2224
2225 /*
2226 * Look for queue of fragments
2227 * of this datagram.
2228 */
2229 TAILQ_FOREACH(fp, head, ipq_list) {
2230 if (ip->ip_id == fp->ipq_id &&
2231 ip->ip_src.s_addr == fp->ipq_src.s_addr &&
2232 ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
0a7de745 2233 ip->ip_p == fp->ipq_p) {
39236c6e 2234 goto found;
0a7de745 2235 }
39236c6e
A
2236 }
2237
2238 fp = NULL;
2239
2240 /*
2241 * Attempt to trim the number of allocated fragment queues if it
2242 * exceeds the administrative limit.
2243 */
2244 if ((nipq > (unsigned)maxnipq) && (maxnipq > 0)) {
2245 /*
2246 * drop something from the tail of the current queue
2247 * before proceeding further
2248 */
2249 struct ipq *fq = TAILQ_LAST(head, ipqhead);
2250 if (fq == NULL) { /* gak */
2251 for (i = 0; i < IPREASS_NHASH; i++) {
2252 struct ipq *r = TAILQ_LAST(&ipq[i], ipqhead);
2253 if (r) {
2254 ipstat.ips_fragtimeout += r->ipq_nfrags;
2255 frag_freef(&ipq[i], r);
2256 break;
2257 }
2258 }
2259 } else {
2260 ipstat.ips_fragtimeout += fq->ipq_nfrags;
2261 frag_freef(head, fq);
2262 }
2263 }
2264
2265found:
2266 /*
2267 * Leverage partial checksum offload for IP fragments. Narrow down
2268 * the scope to cover only UDP without IP options, as that is the
2269 * most common case.
2270 *
2271 * Perform 1's complement adjustment of octets that got included/
2272 * excluded in the hardware-calculated checksum value. Ignore cases
5ba3f43e
A
2273 * where the value includes the entire IPv4 header span, as the sum
2274 * for those octets would already be 0 by the time we get here; IP
2275 * has already performed its header checksum validation. Also take
2276 * care of any trailing bytes and subtract out their partial sum.
39236c6e 2277 */
0a7de745 2278 if (ip->ip_p == IPPROTO_UDP && hlen == sizeof(struct ip) &&
39236c6e
A
2279 (m->m_pkthdr.csum_flags &
2280 (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
2281 (CSUM_DATA_VALID | CSUM_PARTIAL)) {
5ba3f43e
A
2282 uint32_t start = m->m_pkthdr.csum_rx_start;
2283 int32_t trailer = (m_pktlen(m) - ip->ip_len);
2284 uint32_t swbytes = (uint32_t)trailer;
39236c6e 2285
39236c6e 2286 csum = m->m_pkthdr.csum_rx_val;
1c79356b 2287
5ba3f43e
A
2288 ASSERT(trailer >= 0);
2289 if ((start != 0 && start != hlen) || trailer != 0) {
cb323159
A
2290 uint32_t datalen = ip->ip_len - hlen;
2291
39236c6e
A
2292#if BYTE_ORDER != BIG_ENDIAN
2293 if (start < hlen) {
2294 HTONS(ip->ip_len);
2295 HTONS(ip->ip_off);
2296 }
5ba3f43e 2297#endif /* BYTE_ORDER != BIG_ENDIAN */
39236c6e 2298 /* callee folds in sum */
cb323159 2299 csum = m_adj_sum16(m, start, hlen, datalen, csum);
0a7de745 2300 if (hlen > start) {
5ba3f43e 2301 swbytes += (hlen - start);
0a7de745 2302 } else {
5ba3f43e 2303 swbytes += (start - hlen);
0a7de745 2304 }
39236c6e
A
2305#if BYTE_ORDER != BIG_ENDIAN
2306 if (start < hlen) {
2307 NTOHS(ip->ip_off);
2308 NTOHS(ip->ip_len);
2309 }
5ba3f43e 2310#endif /* BYTE_ORDER != BIG_ENDIAN */
39236c6e
A
2311 }
2312 csum_flags = m->m_pkthdr.csum_flags;
5ba3f43e 2313
0a7de745 2314 if (swbytes != 0) {
5ba3f43e 2315 udp_in_cksum_stats(swbytes);
0a7de745
A
2316 }
2317 if (trailer != 0) {
5ba3f43e 2318 m_adj(m, -trailer);
0a7de745 2319 }
39236c6e
A
2320 } else {
2321 csum = 0;
2322 csum_flags = 0;
2323 }
2324
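/*
 * [Editor's note: illustrative sketch, not part of the original source.]
 * The adjustment above relies on the additive structure of the Internet
 * checksum: the 16-bit one's-complement sum of a buffer equals the
 * one's-complement addition of the sums of its (even-length) pieces, so
 * octets that the hardware included but that lie outside the UDP span
 * (the IP header, any trailer) can be peeled back out of csum_rx_val.
 * The helpers below are a generic user-space sketch of that arithmetic,
 * not the kernel's m_adj_sum16(); sum16() and ones_add() are invented
 * names.
 */
#if 0 /* standalone sketch; builds in user space, not in the kernel */
#include <stdint.h>
#include <stddef.h>

/* One's-complement sum of an even number of bytes, carries folded. */
static uint16_t
sum16(const uint8_t *p, size_t len)
{
	uint32_t sum = 0;

	for (size_t i = 0; i + 1 < len; i += 2) {
		sum += (uint32_t)((p[i] << 8) | p[i + 1]);
	}
	while (sum > 0xffff) {
		sum = (sum & 0xffff) + (sum >> 16);     /* fold carries */
	}
	return (uint16_t)sum;
}

/* For even a: sum16(buf, a + b) == ones_add(sum16(buf, a),
 * sum16(buf + a, b)), which is what lets the code above subtract the
 * header and trailer contributions from a hardware-computed value. */
static uint16_t
ones_add(uint16_t x, uint16_t y)
{
	uint32_t s = (uint32_t)x + (uint32_t)y;

	return (uint16_t)((s & 0xffff) + (s >> 16));
}
#endif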
2325 /* Invalidate checksum */
2326 m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
2327
2328 ipstat.ips_fragments++;
2329
2330 /*
2331 * Adjust ip_len to not reflect header,
2332 * convert offset of this to bytes.
2333 */
2334 ip->ip_len -= hlen;
2335 if (ip->ip_off & IP_MF) {
2336 /*
2337 * Make sure that fragments have a data length
2338 * that's a non-zero multiple of 8 bytes.
2339 */
2340 if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) {
2341 OSAddAtomic(1, &ipstat.ips_toosmall);
2342 /*
2343 * Reassembly queue may have been found if previous
2344 * fragments were valid; given that this one is bad,
2345 * we need to drop it. Make sure to set fp to NULL
2346 * if not already, since we don't want to decrement
2347 * ipq_nfrags as it doesn't include this packet.
2348 */
2349 fp = NULL;
2350 goto dropfrag;
2351 }
2352 m->m_flags |= M_FRAG;
2353 } else {
2354 /* Clear the flag in case packet comes from loopback */
2355 m->m_flags &= ~M_FRAG;
2356 }
2357 ip->ip_off <<= 3;
2358
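/*
 * [Editor's note: illustrative worked example, not part of the original
 * source.]  The fragment offset field counts 8-octet units, so the
 * shift above converts it to a byte offset (and, via the 16-bit store,
 * effectively shifts the flag bits out).  For example, a fragment
 * carrying bytes 1480-2959 of the original datagram has an offset field
 * of 185, since 185 * 8 == 1480; this is also why every fragment except
 * the last must carry a payload that is a non-zero multiple of 8 bytes,
 * as checked above.
 */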
2359 m->m_pkthdr.pkt_hdr = ip;
2360
2361 /* Previous ip_reass() started here. */
1c79356b
A
2362 /*
2363 * Presence of header sizes in mbufs
2364 * would confuse code below.
2365 */
2366 m->m_data += hlen;
2367 m->m_len -= hlen;
2368
2369 /*
2370 * If first fragment to arrive, create a reassembly queue.
2371 */
39236c6e
A
2372 if (fp == NULL) {
2373 fp = ipq_alloc(M_DONTWAIT);
0a7de745 2374 if (fp == NULL) {
1c79356b 2375 goto dropfrag;
0a7de745 2376 }
39236c6e 2377 TAILQ_INSERT_HEAD(head, fp, ipq_list);
1c79356b 2378 nipq++;
483a1d10 2379 fp->ipq_nfrags = 1;
1c79356b
A
2380 fp->ipq_ttl = IPFRAGTTL;
2381 fp->ipq_p = ip->ip_p;
2382 fp->ipq_id = ip->ip_id;
2383 fp->ipq_src = ip->ip_src;
2384 fp->ipq_dst = ip->ip_dst;
2385 fp->ipq_frags = m;
2386 m->m_nextpkt = NULL;
39236c6e
A
2387 /*
2388 * If the first fragment has valid checksum offload
2389 * info, the rest of fragments are eligible as well.
2390 */
2391 if (csum_flags != 0) {
2392 fp->ipq_csum = csum;
2393 fp->ipq_csum_flags = csum_flags;
2394 }
0a7de745 2395 m = NULL; /* nothing to return */
39236c6e 2396 goto done;
483a1d10
A
2397 } else {
2398 fp->ipq_nfrags++;
1c79356b
A
2399 }
2400
0a7de745 2401#define GETIP(m) ((struct ip *)((m)->m_pkthdr.pkt_hdr))
1c79356b 2402
2d21ac55
A
2403 /*
2404 * Handle ECN by comparing this segment with the first one;
2405 * if CE is set, do not lose CE.
2406 * drop if CE and not-ECT are mixed for the same packet.
2407 */
2408 ecn = ip->ip_tos & IPTOS_ECN_MASK;
2409 ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK;
2410 if (ecn == IPTOS_ECN_CE) {
0a7de745 2411 if (ecn0 == IPTOS_ECN_NOTECT) {
2d21ac55 2412 goto dropfrag;
0a7de745
A
2413 }
2414 if (ecn0 != IPTOS_ECN_CE) {
2d21ac55 2415 GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE;
0a7de745 2416 }
2d21ac55 2417 }
0a7de745 2418 if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) {
2d21ac55 2419 goto dropfrag;
0a7de745 2420 }
2d21ac55 2421
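/*
 * [Editor's note: illustrative sketch, not part of the original source.]
 * The ECN handling above implements the usual reassembly rule: mixing
 * Not-ECT with ECN-capable fragments drops the datagram, and a CE mark
 * on any fragment is propagated to the reassembled packet.  The function
 * below is a compact restatement of the same decision; ecn_combine() and
 * the ECN_* macros are invented names for this sketch only.
 */
#if 0 /* standalone sketch; builds in user space, not in the kernel */
#define ECN_NOTECT 0x00
#define ECN_CE     0x03

/* ecn0 is the first fragment's codepoint, ecn the new fragment's.
 * Returns the codepoint to keep on the first fragment, or -1 to drop. */
static int
ecn_combine(int ecn0, int ecn)
{
	if (ecn == ECN_CE) {
		return (ecn0 == ECN_NOTECT) ? -1 : ECN_CE;
	}
	if (ecn == ECN_NOTECT && ecn0 != ECN_NOTECT) {
		return -1;
	}
	return ecn0;    /* ECT(0)/ECT(1) fragments leave ecn0 unchanged */
}
#endif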
1c79356b
A
2422 /*
2423 * Find a segment which begins after this one does.
2424 */
0a7de745
A
2425 for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
2426 if (GETIP(q)->ip_off > ip->ip_off) {
1c79356b 2427 break;
0a7de745
A
2428 }
2429 }
1c79356b
A
2430
2431 /*
2432 * If there is a preceding segment, it may provide some of
2433 * our data already. If so, drop the data from the incoming
2434 * segment. If it provides all of our data, drop us, otherwise
2435 * stick new segment in the proper place.
9bccf70c 2436 *
39236c6e 2437 * If some of the data is dropped from the preceding
9bccf70c 2438 * segment, then its checksum is invalidated.
1c79356b
A
2439 */
2440 if (p) {
2441 i = GETIP(p)->ip_off + GETIP(p)->ip_len - ip->ip_off;
2442 if (i > 0) {
0a7de745 2443 if (i >= ip->ip_len) {
1c79356b 2444 goto dropfrag;
0a7de745 2445 }
9bccf70c 2446 m_adj(m, i);
39236c6e 2447 fp->ipq_csum_flags = 0;
1c79356b
A
2448 ip->ip_off += i;
2449 ip->ip_len -= i;
2450 }
2451 m->m_nextpkt = p->m_nextpkt;
2452 p->m_nextpkt = m;
2453 } else {
2454 m->m_nextpkt = fp->ipq_frags;
2455 fp->ipq_frags = m;
2456 }
2457
2458 /*
2459 * While we overlap succeeding segments trim them or,
2460 * if they are completely covered, dequeue them.
2461 */
2462 for (; q != NULL && ip->ip_off + ip->ip_len > GETIP(q)->ip_off;
39236c6e
A
2463 q = nq) {
2464 i = (ip->ip_off + ip->ip_len) - GETIP(q)->ip_off;
1c79356b
A
2465 if (i < GETIP(q)->ip_len) {
2466 GETIP(q)->ip_len -= i;
2467 GETIP(q)->ip_off += i;
2468 m_adj(q, i);
39236c6e 2469 fp->ipq_csum_flags = 0;
1c79356b
A
2470 break;
2471 }
2472 nq = q->m_nextpkt;
2473 m->m_nextpkt = nq;
39236c6e 2474 ipstat.ips_fragdropped++;
483a1d10 2475 fp->ipq_nfrags--;
39236c6e
A
2476 /* defer freeing until after lock is dropped */
2477 MBUFQ_ENQUEUE(&dfq, q);
1c79356b
A
2478 }
2479
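/*
 * [Editor's note: illustrative worked example, not part of the original
 * source.]  Overlap handling above, by the numbers: suppose the queue
 * already holds a fragment p covering bytes [0, 1480) and the new
 * fragment m claims [1472, 2952).  Then
 *
 *     i = p.off + p.len - m.off = 0 + 1480 - 1472 = 8
 *
 * so the first 8 bytes of m duplicate data already queued; m_adj(m, 8)
 * trims them and m becomes [1480, 2952).  Symmetrically, if a queued
 * fragment q starts at byte 2944, the new fragment overlaps its first
 * 2952 - 2944 = 8 bytes, so q is trimmed (or dequeued entirely if it is
 * completely covered).  Any trimming also invalidates the accumulated
 * hardware checksum for the queue.
 */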
39236c6e
A
2480 /*
2481 * If this fragment contains similar checksum offload info
2482 * as that of the existing ones, accumulate checksum. Otherwise,
2483 * invalidate checksum offload info for the entire datagram.
2484 */
0a7de745 2485 if (csum_flags != 0 && csum_flags == fp->ipq_csum_flags) {
39236c6e 2486 fp->ipq_csum += csum;
0a7de745 2487 } else if (fp->ipq_csum_flags != 0) {
39236c6e 2488 fp->ipq_csum_flags = 0;
0a7de745 2489 }
1c79356b 2490
1c79356b
A
2491
2492 /*
483a1d10
A
2493 * Check for complete reassembly and perform frag per packet
2494 * limiting.
2495 *
2496 * Frag limiting is performed here so that the nth frag has
2497 * a chance to complete the packet before we drop the packet.
2498 * As a result, n+1 frags are actually allowed per packet, but
2499 * only n will ever be stored. (n = maxfragsperpacket.)
2500 *
1c79356b
A
2501 */
2502 next = 0;
2503 for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
483a1d10
A
2504 if (GETIP(q)->ip_off != next) {
2505 if (fp->ipq_nfrags > maxfragsperpacket) {
39236c6e
A
2506 ipstat.ips_fragdropped += fp->ipq_nfrags;
2507 frag_freef(head, fp);
483a1d10 2508 }
0a7de745 2509 m = NULL; /* nothing to return */
39236c6e 2510 goto done;
483a1d10 2511 }
1c79356b
A
2512 next += GETIP(q)->ip_len;
2513 }
2514 /* Make sure the last packet didn't have the IP_MF flag */
483a1d10
A
2515 if (p->m_flags & M_FRAG) {
2516 if (fp->ipq_nfrags > maxfragsperpacket) {
39236c6e
A
2517 ipstat.ips_fragdropped += fp->ipq_nfrags;
2518 frag_freef(head, fp);
483a1d10 2519 }
0a7de745 2520 m = NULL; /* nothing to return */
39236c6e 2521 goto done;
483a1d10 2522 }
1c79356b
A
2523
2524 /*
2525 * Reassembly is complete. Make sure the packet is a sane size.
2526 */
2527 q = fp->ipq_frags;
2528 ip = GETIP(q);
2529 if (next + (IP_VHL_HL(ip->ip_vhl) << 2) > IP_MAXPACKET) {
39236c6e
A
2530 ipstat.ips_toolong++;
2531 ipstat.ips_fragdropped += fp->ipq_nfrags;
2532 frag_freef(head, fp);
0a7de745 2533 m = NULL; /* nothing to return */
39236c6e 2534 goto done;
1c79356b
A
2535 }
2536
2537 /*
2538 * Concatenate fragments.
2539 */
2540 m = q;
2541 t = m->m_next;
39236c6e 2542 m->m_next = NULL;
1c79356b
A
2543 m_cat(m, t);
2544 nq = q->m_nextpkt;
39236c6e 2545 q->m_nextpkt = NULL;
1c79356b
A
2546 for (q = nq; q != NULL; q = nq) {
2547 nq = q->m_nextpkt;
2548 q->m_nextpkt = NULL;
2549 m_cat(m, q);
2550 }
2551
39236c6e
A
2552 /*
2553 * Store partial hardware checksum info from the fragment queue;
2554 * the receive start offset is set to 20 bytes (see code at the
2555 * top of this routine).
2556 */
2557 if (fp->ipq_csum_flags != 0) {
2558 csum = fp->ipq_csum;
2559
2560 ADDCARRY(csum);
2561
2562 m->m_pkthdr.csum_rx_val = csum;
0a7de745 2563 m->m_pkthdr.csum_rx_start = sizeof(struct ip);
39236c6e
A
2564 m->m_pkthdr.csum_flags = fp->ipq_csum_flags;
2565 } else if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) ||
2566 (m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
2567 /* loopback checksums are always OK */
2568 m->m_pkthdr.csum_data = 0xffff;
39236c6e
A
2569 m->m_pkthdr.csum_flags =
2570 CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
2571 CSUM_IP_CHECKED | CSUM_IP_VALID;
2572 }
2573
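/*
 * [Editor's note: illustrative worked example, not part of the original
 * source.]  The accumulated value is folded back into 16 bits before it
 * is stored, which works because 2^16 is congruent to 1 modulo 0xffff,
 * the one's-complement modulus.  For example, an accumulator of 0x2fffd
 * folds as 0x0002 + 0xfffd = 0xffff, the same 16-bit result that would
 * have been obtained had the carries never left the low 16 bits.
 */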
1c79356b 2574 /*
39236c6e
A
2575 * Create header for new ip packet by modifying header of first
2576 * packet; dequeue and discard fragment reassembly header.
1c79356b
A
2577 * Make header visible.
2578 */
39236c6e 2579 ip->ip_len = (IP_VHL_HL(ip->ip_vhl) << 2) + next;
1c79356b
A
2580 ip->ip_src = fp->ipq_src;
2581 ip->ip_dst = fp->ipq_dst;
39236c6e 2582
0a7de745 2583 fp->ipq_frags = NULL; /* return to caller as 'm' */
39236c6e
A
2584 frag_freef(head, fp);
2585 fp = NULL;
2586
1c79356b
A
2587 m->m_len += (IP_VHL_HL(ip->ip_vhl) << 2);
2588 m->m_data -= (IP_VHL_HL(ip->ip_vhl) << 2);
2589 /* some debugging cruft by sklower, below, will go away soon */
0a7de745 2590 if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */
39236c6e 2591 m_fixhdr(m);
0a7de745 2592 }
39236c6e
A
2593 ipstat.ips_reassembled++;
2594
2595 /* arm the purge timer if not already and if there's work to do */
2596 frag_sched_timeout();
2597 lck_mtx_unlock(&ipqlock);
2598 /* perform deferred free (if needed) now that lock is dropped */
0a7de745 2599 if (!MBUFQ_EMPTY(&dfq)) {
39236c6e 2600 MBUFQ_DRAIN(&dfq);
0a7de745 2601 }
39236c6e 2602 VERIFY(MBUFQ_EMPTY(&dfq));
0a7de745 2603 return m;
1c79356b 2604
39236c6e
A
2605done:
2606 VERIFY(m == NULL);
2607 /* arm the purge timer if not already and if there's work to do */
2608 frag_sched_timeout();
2609 lck_mtx_unlock(&ipqlock);
2610 /* perform deferred free (if needed) */
0a7de745 2611 if (!MBUFQ_EMPTY(&dfq)) {
39236c6e 2612 MBUFQ_DRAIN(&dfq);
0a7de745 2613 }
39236c6e 2614 VERIFY(MBUFQ_EMPTY(&dfq));
0a7de745 2615 return NULL;
39236c6e 2616
1c79356b 2617dropfrag:
39236c6e 2618 ipstat.ips_fragdropped++;
0a7de745 2619 if (fp != NULL) {
483a1d10 2620 fp->ipq_nfrags--;
0a7de745 2621 }
39236c6e
A
2622 /* arm the purge timer if not already and if there's work to do */
2623 frag_sched_timeout();
2624 lck_mtx_unlock(&ipqlock);
1c79356b 2625 m_freem(m);
39236c6e 2626 /* perform deferred free (if needed) */
0a7de745 2627 if (!MBUFQ_EMPTY(&dfq)) {
39236c6e 2628 MBUFQ_DRAIN(&dfq);
0a7de745 2629 }
39236c6e 2630 VERIFY(MBUFQ_EMPTY(&dfq));
0a7de745 2631 return NULL;
1c79356b
A
2632#undef GETIP
2633}
2634
2635/*
2636 * Free a fragment reassembly header and all
2637 * associated datagrams.
2638 */
2639static void
39236c6e 2640frag_freef(struct ipqhead *fhp, struct ipq *fp)
1c79356b 2641{
5ba3f43e 2642 LCK_MTX_ASSERT(&ipqlock, LCK_MTX_ASSERT_OWNED);
39236c6e
A
2643
2644 fp->ipq_nfrags = 0;
2645 if (fp->ipq_frags != NULL) {
2646 m_freem_list(fp->ipq_frags);
2647 fp->ipq_frags = NULL;
2648 }
2649 TAILQ_REMOVE(fhp, fp, ipq_list);
1c79356b 2650 nipq--;
39236c6e 2651 ipq_free(fp);
1c79356b
A
2652}
2653
2654/*
39236c6e 2655 * IP reassembly timer processing
1c79356b 2656 */
39236c6e
A
2657static void
2658frag_timeout(void *arg)
1c79356b 2659{
39236c6e 2660#pragma unused(arg)
2d21ac55 2661 struct ipq *fp;
1c79356b 2662 int i;
39236c6e
A
2663
2664 /*
2665 * Update coarse-grained networking timestamp (in sec.); the idea
2666 * is to piggy-back on the timeout callout to update the counter
2667 * returnable via net_uptime().
2668 */
2669 net_update_uptime();
2670
2671 lck_mtx_lock(&ipqlock);
1c79356b 2672 for (i = 0; i < IPREASS_NHASH; i++) {
0a7de745 2673 for (fp = TAILQ_FIRST(&ipq[i]); fp;) {
39236c6e
A
2674 struct ipq *fpp;
2675
2676 fpp = fp;
2677 fp = TAILQ_NEXT(fp, ipq_list);
2678 if (--fpp->ipq_ttl == 0) {
2679 ipstat.ips_fragtimeout += fpp->ipq_nfrags;
2680 frag_freef(&ipq[i], fpp);
1c79356b
A
2681 }
2682 }
2683 }
9bccf70c
A
2684 /*
2685 * If we are over the maximum number of fragments
2686 * (due to the limit being lowered), drain off
2687 * enough to get down to the new limit.
2688 */
39236c6e
A
2689 if (maxnipq >= 0 && nipq > (unsigned)maxnipq) {
2690 for (i = 0; i < IPREASS_NHASH; i++) {
2691 while (nipq > (unsigned)maxnipq &&
2692 !TAILQ_EMPTY(&ipq[i])) {
2693 ipstat.ips_fragdropped +=
2694 TAILQ_FIRST(&ipq[i])->ipq_nfrags;
2695 frag_freef(&ipq[i], TAILQ_FIRST(&ipq[i]));
9bccf70c
A
2696 }
2697 }
2698 }
39236c6e
A
2699 /* re-arm the purge timer if there's work to do */
2700 frag_timeout_run = 0;
2701 frag_sched_timeout();
2702 lck_mtx_unlock(&ipqlock);
2703}
2704
2705static void
2706frag_sched_timeout(void)
2707{
5ba3f43e 2708 LCK_MTX_ASSERT(&ipqlock, LCK_MTX_ASSERT_OWNED);
39236c6e
A
2709
2710 if (!frag_timeout_run && nipq > 0) {
2711 frag_timeout_run = 1;
2712 timeout(frag_timeout, NULL, hz);
2713 }
1c79356b
A
2714}
2715
2716/*
2717 * Drain off all datagram fragments.
2718 */
39236c6e
A
2719static void
2720frag_drain(void)
1c79356b 2721{
39236c6e 2722 int i;
1c79356b 2723
39236c6e 2724 lck_mtx_lock(&ipqlock);
1c79356b 2725 for (i = 0; i < IPREASS_NHASH; i++) {
39236c6e
A
2726 while (!TAILQ_EMPTY(&ipq[i])) {
2727 ipstat.ips_fragdropped +=
2728 TAILQ_FIRST(&ipq[i])->ipq_nfrags;
2729 frag_freef(&ipq[i], TAILQ_FIRST(&ipq[i]));
1c79356b
A
2730 }
2731 }
39236c6e
A
2732 lck_mtx_unlock(&ipqlock);
2733}
2734
2735static struct ipq *
2736ipq_alloc(int how)
2737{
2738 struct mbuf *t;
2739 struct ipq *fp;
2740
2741 /*
2742 * See comments in ipq_updateparams(). Keep the count separate
2743 * from nipq since the latter represents the elements already
2744 * in the reassembly queues.
2745 */
0a7de745
A
2746 if (ipq_limit > 0 && ipq_count > ipq_limit) {
2747 return NULL;
2748 }
39236c6e
A
2749
2750 t = m_get(how, MT_FTABLE);
2751 if (t != NULL) {
2752 atomic_add_32(&ipq_count, 1);
2753 fp = mtod(t, struct ipq *);
0a7de745 2754 bzero(fp, sizeof(*fp));
39236c6e
A
2755 } else {
2756 fp = NULL;
2757 }
0a7de745 2758 return fp;
39236c6e
A
2759}
2760
2761static void
2762ipq_free(struct ipq *fp)
2763{
2764 (void) m_free(dtom(fp));
2765 atomic_add_32(&ipq_count, -1);
2766}
2767
2768/*
2769 * Drain callback
2770 */
2771void
2772ip_drain(void)
2773{
0a7de745
A
2774 frag_drain(); /* fragments */
2775 in_rtqdrain(); /* protocol cloned routes */
2776 in_arpdrain(NULL); /* cloned routes: ARP */
1c79356b
A
2777}
2778
2779/*
2780 * Do option processing on a datagram,
2781 * possibly discarding it if bad options are encountered,
2782 * or forwarding it if source-routed.
91447636
A
2783 * The pass argument is used when operating in the IPSTEALTH
2784 * mode to tell what options to process:
2785 * [LS]SRR (pass 0) or the others (pass 1).
2786 * The reason for as many as two passes is that when doing IPSTEALTH,
2787 * non-routing options should be processed only if the packet is for us.
1c79356b
A
2788 * Returns 1 if packet has been forwarded/freed,
2789 * 0 if the packet should be processed further.
2790 */
2791static int
39236c6e 2792ip_dooptions(struct mbuf *m, int pass, struct sockaddr_in *next_hop)
1c79356b 2793{
39236c6e 2794#pragma unused(pass)
2d21ac55
A
2795 struct ip *ip = mtod(m, struct ip *);
2796 u_char *cp;
2797 struct ip_timestamp *ipt;
2798 struct in_ifaddr *ia;
1c79356b
A
2799 int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0;
2800 struct in_addr *sin, dst;
04b8595b 2801 u_int32_t ntime;
b0d623f7 2802 struct sockaddr_in ipaddr = {
cb323159
A
2803 .sin_len = sizeof(ipaddr),
2804 .sin_family = AF_INET,
2805 .sin_port = 0,
2806 .sin_addr = { .s_addr = 0 },
2807 .sin_zero = { 0, }
0a7de745 2808 };
1c79356b 2809
316670eb
A
2810 /* Expect 32-bit aligned data pointer on strict-align platforms */
2811 MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
2812
1c79356b
A
2813 dst = ip->ip_dst;
2814 cp = (u_char *)(ip + 1);
0a7de745 2815 cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof(struct ip);
1c79356b
A
2816 for (; cnt > 0; cnt -= optlen, cp += optlen) {
2817 opt = cp[IPOPT_OPTVAL];
0a7de745 2818 if (opt == IPOPT_EOL) {
1c79356b 2819 break;
0a7de745
A
2820 }
2821 if (opt == IPOPT_NOP) {
1c79356b 2822 optlen = 1;
0a7de745
A
2823 } else {
2824 if (cnt < IPOPT_OLEN + sizeof(*cp)) {
9bccf70c 2825 code = &cp[IPOPT_OLEN] - (u_char *)ip;
1c79356b
A
2826 goto bad;
2827 }
2828 optlen = cp[IPOPT_OLEN];
0a7de745 2829 if (optlen < IPOPT_OLEN + sizeof(*cp) ||
39236c6e 2830 optlen > cnt) {
1c79356b
A
2831 code = &cp[IPOPT_OLEN] - (u_char *)ip;
2832 goto bad;
2833 }
2834 }
2835 switch (opt) {
1c79356b
A
2836 default:
2837 break;
2838
2839 /*
2840 * Source routing with record.
2841 * Find interface with current destination address.
2842 * If none on this machine then drop if strictly routed,
2843 * or do nothing if loosely routed.
2844 * Record interface address and bring up next address
2845 * component. If strictly routed make sure next
2846 * address is on directly accessible net.
2847 */
2848 case IPOPT_LSRR:
2849 case IPOPT_SSRR:
0a7de745 2850 if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
9bccf70c
A
2851 code = &cp[IPOPT_OLEN] - (u_char *)ip;
2852 goto bad;
2853 }
1c79356b
A
2854 if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
2855 code = &cp[IPOPT_OFFSET] - (u_char *)ip;
2856 goto bad;
2857 }
2858 ipaddr.sin_addr = ip->ip_dst;
39236c6e
A
2859 ia = (struct in_ifaddr *)ifa_ifwithaddr(SA(&ipaddr));
2860 if (ia == NULL) {
1c79356b
A
2861 if (opt == IPOPT_SSRR) {
2862 type = ICMP_UNREACH;
2863 code = ICMP_UNREACH_SRCFAIL;
2864 goto bad;
2865 }
0a7de745 2866 if (!ip_dosourceroute) {
1c79356b 2867 goto nosourcerouting;
0a7de745 2868 }
1c79356b
A
2869 /*
2870 * Loose routing, and not at next destination
2871 * yet; nothing to do except forward.
2872 */
2873 break;
39236c6e 2874 } else {
6d2010ae 2875 IFA_REMREF(&ia->ia_ifa);
91447636
A
2876 ia = NULL;
2877 }
0a7de745
A
2878 off--; /* 0 origin */
2879 if (off > optlen - (int)sizeof(struct in_addr)) {
1c79356b
A
2880 /*
2881 * End of source route. Should be for us.
2882 */
0a7de745 2883 if (!ip_acceptsourceroute) {
1c79356b 2884 goto nosourcerouting;
0a7de745 2885 }
1c79356b
A
2886 save_rte(cp, ip->ip_src);
2887 break;
2888 }
2889
2890 if (!ip_dosourceroute) {
2891 if (ipforwarding) {
91447636
A
2892 char buf[MAX_IPv4_STR_LEN];
2893 char buf2[MAX_IPv4_STR_LEN];
1c79356b
A
2894 /*
2895 * Acting as a router, so generate ICMP
2896 */
2897nosourcerouting:
91447636 2898 log(LOG_WARNING,
39236c6e
A
2899 "attempted source route from %s "
2900 "to %s\n",
2901 inet_ntop(AF_INET, &ip->ip_src,
0a7de745 2902 buf, sizeof(buf)),
39236c6e 2903 inet_ntop(AF_INET, &ip->ip_dst,
0a7de745 2904 buf2, sizeof(buf2)));
1c79356b
A
2905 type = ICMP_UNREACH;
2906 code = ICMP_UNREACH_SRCFAIL;
2907 goto bad;
2908 } else {
2909 /*
39236c6e
A
2910 * Not acting as a router,
2911 * so silently drop.
1c79356b 2912 */
b0d623f7 2913 OSAddAtomic(1, &ipstat.ips_cantforward);
1c79356b 2914 m_freem(m);
0a7de745 2915 return 1;
1c79356b
A
2916 }
2917 }
2918
2919 /*
2920 * locate outgoing interface
2921 */
39236c6e 2922 (void) memcpy(&ipaddr.sin_addr, cp + off,
0a7de745 2923 sizeof(ipaddr.sin_addr));
1c79356b
A
2924
2925 if (opt == IPOPT_SSRR) {
0a7de745 2926#define INA struct in_ifaddr *
316670eb 2927 if ((ia = (INA)ifa_ifwithdstaddr(
0a7de745 2928 SA(&ipaddr))) == NULL) {
39236c6e 2929 ia = (INA)ifa_ifwithnet(SA(&ipaddr));
91447636
A
2930 }
2931 } else {
b0d623f7 2932 ia = ip_rtaddr(ipaddr.sin_addr);
91447636 2933 }
39236c6e 2934 if (ia == NULL) {
1c79356b
A
2935 type = ICMP_UNREACH;
2936 code = ICMP_UNREACH_SRCFAIL;
2937 goto bad;
2938 }
2939 ip->ip_dst = ipaddr.sin_addr;
6d2010ae 2940 IFA_LOCK(&ia->ia_ifa);
39236c6e 2941 (void) memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
0a7de745 2942 sizeof(struct in_addr));
6d2010ae
A
2943 IFA_UNLOCK(&ia->ia_ifa);
2944 IFA_REMREF(&ia->ia_ifa);
91447636 2945 ia = NULL;
0a7de745 2946 cp[IPOPT_OFFSET] += sizeof(struct in_addr);
1c79356b
A
2947 /*
2948 * Let ip_intr's mcast routing check handle mcast pkts
2949 */
2950 forward = !IN_MULTICAST(ntohl(ip->ip_dst.s_addr));
2951 break;
2952
2953 case IPOPT_RR:
0a7de745 2954 if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
1c79356b
A
2955 code = &cp[IPOPT_OFFSET] - (u_char *)ip;
2956 goto bad;
2957 }
2958 if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
2959 code = &cp[IPOPT_OFFSET] - (u_char *)ip;
2960 goto bad;
2961 }
2962 /*
2963 * If no space remains, ignore.
2964 */
0a7de745
A
2965 off--; /* 0 origin */
2966 if (off > optlen - (int)sizeof(struct in_addr)) {
1c79356b 2967 break;
0a7de745 2968 }
39236c6e 2969 (void) memcpy(&ipaddr.sin_addr, &ip->ip_dst,
0a7de745 2970 sizeof(ipaddr.sin_addr));
1c79356b
A
2971 /*
2972 * locate outgoing interface; if we're the destination,
2973 * use the incoming interface (should be same).
2974 */
39236c6e
A
2975 if ((ia = (INA)ifa_ifwithaddr(SA(&ipaddr))) == NULL) {
2976 if ((ia = ip_rtaddr(ipaddr.sin_addr)) == NULL) {
91447636
A
2977 type = ICMP_UNREACH;
2978 code = ICMP_UNREACH_HOST;
2979 goto bad;
2980 }
1c79356b 2981 }
6d2010ae 2982 IFA_LOCK(&ia->ia_ifa);
39236c6e 2983 (void) memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
0a7de745 2984 sizeof(struct in_addr));
6d2010ae
A
2985 IFA_UNLOCK(&ia->ia_ifa);
2986 IFA_REMREF(&ia->ia_ifa);
91447636 2987 ia = NULL;
0a7de745 2988 cp[IPOPT_OFFSET] += sizeof(struct in_addr);
1c79356b
A
2989 break;
2990
2991 case IPOPT_TS:
2992 code = cp - (u_char *)ip;
316670eb 2993 ipt = (struct ip_timestamp *)(void *)cp;
9bccf70c
A
2994 if (ipt->ipt_len < 4 || ipt->ipt_len > 40) {
2995 code = (u_char *)&ipt->ipt_len - (u_char *)ip;
1c79356b 2996 goto bad;
9bccf70c
A
2997 }
2998 if (ipt->ipt_ptr < 5) {
2999 code = (u_char *)&ipt->ipt_ptr - (u_char *)ip;
3000 goto bad;
3001 }
3002 if (ipt->ipt_ptr >
0a7de745 3003 ipt->ipt_len - (int)sizeof(int32_t)) {
9bccf70c
A
3004 if (++ipt->ipt_oflw == 0) {
3005 code = (u_char *)&ipt->ipt_ptr -
3006 (u_char *)ip;
1c79356b 3007 goto bad;
9bccf70c 3008 }
1c79356b
A
3009 break;
3010 }
316670eb 3011 sin = (struct in_addr *)(void *)(cp + ipt->ipt_ptr - 1);
1c79356b 3012 switch (ipt->ipt_flg) {
1c79356b
A
3013 case IPOPT_TS_TSONLY:
3014 break;
3015
3016 case IPOPT_TS_TSANDADDR:
0a7de745
A
3017 if (ipt->ipt_ptr - 1 + sizeof(n_time) +
3018 sizeof(struct in_addr) > ipt->ipt_len) {
9bccf70c
A
3019 code = (u_char *)&ipt->ipt_ptr -
3020 (u_char *)ip;
1c79356b 3021 goto bad;
9bccf70c 3022 }
1c79356b 3023 ipaddr.sin_addr = dst;
39236c6e
A
3024 ia = (INA)ifaof_ifpforaddr(SA(&ipaddr),
3025 m->m_pkthdr.rcvif);
0a7de745 3026 if (ia == NULL) {
1c79356b 3027 continue;
0a7de745 3028 }
6d2010ae 3029 IFA_LOCK(&ia->ia_ifa);
39236c6e 3030 (void) memcpy(sin, &IA_SIN(ia)->sin_addr,
0a7de745 3031 sizeof(struct in_addr));
6d2010ae 3032 IFA_UNLOCK(&ia->ia_ifa);
0a7de745 3033 ipt->ipt_ptr += sizeof(struct in_addr);
6d2010ae 3034 IFA_REMREF(&ia->ia_ifa);
91447636 3035 ia = NULL;
1c79356b
A
3036 break;
3037
3038 case IPOPT_TS_PRESPEC:
0a7de745
A
3039 if (ipt->ipt_ptr - 1 + sizeof(n_time) +
3040 sizeof(struct in_addr) > ipt->ipt_len) {
9bccf70c
A
3041 code = (u_char *)&ipt->ipt_ptr -
3042 (u_char *)ip;
1c79356b 3043 goto bad;
9bccf70c 3044 }
39236c6e 3045 (void) memcpy(&ipaddr.sin_addr, sin,
0a7de745 3046 sizeof(struct in_addr));
39236c6e 3047 if ((ia = (struct in_ifaddr *)ifa_ifwithaddr(
0a7de745 3048 SA(&ipaddr))) == NULL) {
1c79356b 3049 continue;
0a7de745 3050 }
6d2010ae 3051 IFA_REMREF(&ia->ia_ifa);
91447636 3052 ia = NULL;
0a7de745 3053 ipt->ipt_ptr += sizeof(struct in_addr);
1c79356b
A
3054 break;
3055
3056 default:
9bccf70c
A
3057 /* XXX can't take &ipt->ipt_flg */
3058 code = (u_char *)&ipt->ipt_ptr -
3059 (u_char *)ip + 1;
1c79356b
A
3060 goto bad;
3061 }
3062 ntime = iptime();
39236c6e 3063 (void) memcpy(cp + ipt->ipt_ptr - 1, &ntime,
0a7de745
A
3064 sizeof(n_time));
3065 ipt->ipt_ptr += sizeof(n_time);
1c79356b
A
3066 }
3067 }
3068 if (forward && ipforwarding) {
b0d623f7 3069 ip_forward(m, 1, next_hop);
0a7de745 3070 return 1;
1c79356b 3071 }
0a7de745 3072 return 0;
1c79356b 3073bad:
1c79356b 3074 icmp_error(m, type, code, 0, 0);
b0d623f7 3075 OSAddAtomic(1, &ipstat.ips_badoptions);
0a7de745 3076 return 1;
1c79356b
A
3077}
3078
39236c6e
A
3079/*
3080 * Check for the presence of the IP Router Alert option [RFC2113]
3081 * in the header of an IPv4 datagram.
3082 *
3083 * This call is not intended for use from the forwarding path; it is here
3084 * so that protocol domains may check for the presence of the option.
3085 * Given how FreeBSD's IPv4 stack is currently structured, the Router Alert
3086 * option does not have much relevance to the implementation, though this
3087 * may change in the future.
3088 * Router alert options SHOULD be passed if running in IPSTEALTH mode and
3089 * we are not the endpoint.
3090 * Length checks on individual options should already have been performed
3091 * by ip_dooptions(); therefore they are folded under DIAGNOSTIC here.
3092 *
3093 * Return zero if not present or options are invalid, non-zero if present.
3094 */
3095int
3096ip_checkrouteralert(struct mbuf *m)
3097{
3098 struct ip *ip = mtod(m, struct ip *);
3099 u_char *cp;
3100 int opt, optlen, cnt, found_ra;
3101
3102 found_ra = 0;
3103 cp = (u_char *)(ip + 1);
0a7de745 3104 cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof(struct ip);
39236c6e
A
3105 for (; cnt > 0; cnt -= optlen, cp += optlen) {
3106 opt = cp[IPOPT_OPTVAL];
0a7de745 3107 if (opt == IPOPT_EOL) {
39236c6e 3108 break;
0a7de745
A
3109 }
3110 if (opt == IPOPT_NOP) {
39236c6e 3111 optlen = 1;
0a7de745 3112 } else {
39236c6e 3113#ifdef DIAGNOSTIC
0a7de745 3114 if (cnt < IPOPT_OLEN + sizeof(*cp)) {
39236c6e 3115 break;
0a7de745 3116 }
39236c6e
A
3117#endif
3118 optlen = cp[IPOPT_OLEN];
3119#ifdef DIAGNOSTIC
0a7de745 3120 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) {
39236c6e 3121 break;
0a7de745 3122 }
39236c6e
A
3123#endif
3124 }
3125 switch (opt) {
3126 case IPOPT_RA:
3127#ifdef DIAGNOSTIC
0a7de745
A
3128 if (optlen != IPOPT_OFFSET + sizeof(uint16_t) ||
3129 (*((uint16_t *)(void *)&cp[IPOPT_OFFSET]) != 0)) {
39236c6e 3130 break;
0a7de745 3131 } else
39236c6e 3132#endif
0a7de745 3133 found_ra = 1;
39236c6e
A
3134 break;
3135 default:
3136 break;
3137 }
3138 }
3139
0a7de745 3140 return found_ra;
39236c6e
A
3141}
3142
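/*
 * [Editor's note: illustrative worked example, not part of the original
 * source.]  A well-formed RFC 2113 Router Alert option is exactly four
 * bytes: type 0x94 (copied flag set, option number 20), length 4, and a
 * 16-bit value of zero ("router shall examine packet"), i.e. the option
 * bytes 94 04 00 00.  That is what the DIAGNOSTIC checks above verify:
 * optlen == IPOPT_OFFSET + sizeof(uint16_t) == 4 and a zero value at
 * cp[IPOPT_OFFSET].
 */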
1c79356b
A
3143/*
3144 * Given address of next destination (final or next hop),
3145 * return internet address info of interface to be used to get there.
3146 */
91447636 3147struct in_ifaddr *
b0d623f7 3148ip_rtaddr(struct in_addr dst)
1c79356b 3149{
2d21ac55 3150 struct sockaddr_in *sin;
b0d623f7
A
3151 struct ifaddr *rt_ifa;
3152 struct route ro;
3153
0a7de745 3154 bzero(&ro, sizeof(ro));
39236c6e 3155 sin = SIN(&ro.ro_dst);
b0d623f7 3156 sin->sin_family = AF_INET;
0a7de745 3157 sin->sin_len = sizeof(*sin);
b0d623f7
A
3158 sin->sin_addr = dst;
3159
3160 rtalloc_ign(&ro, RTF_PRCLONING);
39236c6e
A
3161 if (ro.ro_rt == NULL) {
3162 ROUTE_RELEASE(&ro);
0a7de745 3163 return NULL;
39236c6e 3164 }
b0d623f7
A
3165
3166 RT_LOCK(ro.ro_rt);
0a7de745 3167 if ((rt_ifa = ro.ro_rt->rt_ifa) != NULL) {
6d2010ae 3168 IFA_ADDREF(rt_ifa);
0a7de745 3169 }
b0d623f7 3170 RT_UNLOCK(ro.ro_rt);
39236c6e 3171 ROUTE_RELEASE(&ro);
b0d623f7 3172
0a7de745 3173 return (struct in_ifaddr *)rt_ifa;
1c79356b
A
3174}
3175
3176/*
3177 * Save incoming source route for use in replies,
3178 * to be picked up later by ip_srcroute if the receiver is interested.
3179 */
3180void
2d21ac55 3181save_rte(u_char *option, struct in_addr dst)
1c79356b
A
3182{
3183 unsigned olen;
3184
3185 olen = option[IPOPT_OLEN];
3186#if DIAGNOSTIC
0a7de745 3187 if (ipprintfs) {
1c79356b 3188 printf("save_rte: olen %d\n", olen);
0a7de745 3189 }
1c79356b 3190#endif
0a7de745 3191 if (olen > sizeof(ip_srcrt) - (1 + sizeof(dst))) {
1c79356b 3192 return;
0a7de745 3193 }
1c79356b 3194 bcopy(option, ip_srcrt.srcopt, olen);
0a7de745 3195 ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr);
1c79356b
A
3196 ip_srcrt.dst = dst;
3197}
3198
3199/*
3200 * Retrieve incoming source route for use in replies,
3201 * in the same form used by setsockopt.
3202 * The first hop is placed before the options; it will be removed later.
3203 */
3204struct mbuf *
2d21ac55 3205ip_srcroute(void)
1c79356b 3206{
2d21ac55
A
3207 struct in_addr *p, *q;
3208 struct mbuf *m;
1c79356b 3209
0a7de745
A
3210 if (ip_nhops == 0) {
3211 return NULL;
3212 }
39236c6e 3213
1c79356b 3214 m = m_get(M_DONTWAIT, MT_HEADER);
0a7de745
A
3215 if (m == NULL) {
3216 return NULL;
3217 }
1c79356b 3218
0a7de745 3219#define OPTSIZ (sizeof (ip_srcrt.nop) + sizeof (ip_srcrt.srcopt))
1c79356b
A
3220
3221 /* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */
0a7de745
A
3222 m->m_len = ip_nhops * sizeof(struct in_addr) +
3223 sizeof(struct in_addr) + OPTSIZ;
1c79356b 3224#if DIAGNOSTIC
0a7de745 3225 if (ipprintfs) {
1c79356b 3226 printf("ip_srcroute: nhops %d mlen %d", ip_nhops, m->m_len);
0a7de745 3227 }
1c79356b
A
3228#endif
3229
3230 /*
3231 * First save first hop for return route
3232 */
3233 p = &ip_srcrt.route[ip_nhops - 1];
3234 *(mtod(m, struct in_addr *)) = *p--;
3235#if DIAGNOSTIC
0a7de745 3236 if (ipprintfs) {
39236c6e
A
3237 printf(" hops %lx",
3238 (u_int32_t)ntohl(mtod(m, struct in_addr *)->s_addr));
0a7de745 3239 }
1c79356b
A
3240#endif
3241
3242 /*
3243 * Copy option fields and padding (nop) to mbuf.
3244 */
3245 ip_srcrt.nop = IPOPT_NOP;
3246 ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF;
0a7de745 3247 (void) memcpy(mtod(m, caddr_t) + sizeof(struct in_addr),
1c79356b 3248 &ip_srcrt.nop, OPTSIZ);
316670eb 3249 q = (struct in_addr *)(void *)(mtod(m, caddr_t) +
0a7de745 3250 sizeof(struct in_addr) + OPTSIZ);
1c79356b
A
3251#undef OPTSIZ
3252 /*
3253 * Record return path as an IP source route,
3254 * reversing the path (pointers are now aligned).
3255 */
3256 while (p >= ip_srcrt.route) {
3257#if DIAGNOSTIC
0a7de745 3258 if (ipprintfs) {
b0d623f7 3259 printf(" %lx", (u_int32_t)ntohl(q->s_addr));
0a7de745 3260 }
1c79356b
A
3261#endif
3262 *q++ = *p--;
3263 }
3264 /*
3265 * Last hop goes to final destination.
3266 */
3267 *q = ip_srcrt.dst;
3268#if DIAGNOSTIC
0a7de745 3269 if (ipprintfs) {
b0d623f7 3270 printf(" %lx\n", (u_int32_t)ntohl(q->s_addr));
0a7de745 3271 }
1c79356b 3272#endif
0a7de745 3273 return m;
1c79356b
A
3274}
3275
3276/*
5ba3f43e 3277 * Strip out IP options; called from higher-level protocols in the kernel.
1c79356b
A
3278 */
3279void
5ba3f43e 3280ip_stripoptions(struct mbuf *m)
1c79356b 3281{
2d21ac55 3282 int i;
1c79356b 3283 struct ip *ip = mtod(m, struct ip *);
2d21ac55 3284 caddr_t opts;
1c79356b
A
3285 int olen;
3286
316670eb
A
3287 /* Expect 32-bit aligned data pointer on strict-align platforms */
3288 MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
3289
5ba3f43e 3290 /* use bcopy() since it supports overlapping range */
0a7de745 3291 olen = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof(struct ip);
1c79356b 3292 opts = (caddr_t)(ip + 1);
0a7de745 3293 i = m->m_len - (sizeof(struct ip) + olen);
1c79356b
A
3294 bcopy(opts + olen, opts, (unsigned)i);
3295 m->m_len -= olen;
0a7de745 3296 if (m->m_flags & M_PKTHDR) {
1c79356b 3297 m->m_pkthdr.len -= olen;
0a7de745
A
3298 }
3299 ip->ip_vhl = IP_MAKE_VHL(IPVERSION, sizeof(struct ip) >> 2);
5ba3f43e
A
3300
3301 /*
3302 * We expect ip_{off,len} to be in host order by now, and
3303 * that the original IP header length has been subtracted
3304 * out from ip_len. Temporarily adjust ip_len for checksum
3305 * recalculation, and restore it afterwards.
3306 */
0a7de745 3307 ip->ip_len += sizeof(struct ip);
5ba3f43e
A
3308
3309 /* recompute checksum now that IP header is smaller */
3310#if BYTE_ORDER != BIG_ENDIAN
3311 HTONS(ip->ip_len);
3312 HTONS(ip->ip_off);
3313#endif /* BYTE_ORDER != BIG_ENDIAN */
3314 ip->ip_sum = in_cksum_hdr(ip);
3315#if BYTE_ORDER != BIG_ENDIAN
3316 NTOHS(ip->ip_off);
3317 NTOHS(ip->ip_len);
3318#endif /* BYTE_ORDER != BIG_ENDIAN */
3319
0a7de745 3320 ip->ip_len -= sizeof(struct ip);
cb323159
A
3321
3322 /*
3323 * Given that we've just stripped IP options from the header,
3324 * we need to adjust the start offset accordingly if this
3325 * packet had gone thru partial checksum offload.
3326 */
3327 if ((m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
3328 (CSUM_DATA_VALID | CSUM_PARTIAL)) {
3329 if (m->m_pkthdr.csum_rx_start >= (sizeof(struct ip) + olen)) {
3330 /* most common case */
3331 m->m_pkthdr.csum_rx_start -= olen;
3332 } else {
3333 /* compute checksum in software instead */
3334 m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
3335 m->m_pkthdr.csum_data = 0;
3336 ipstat.ips_adj_hwcsum_clr++;
3337 }
3338 }
1c79356b
A
3339}
3340
3341u_char inetctlerrmap[PRC_NCMDS] = {
0a7de745
A
3342 0, 0, 0, 0,
3343 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH,
3344 ENETUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED,
3345 EMSGSIZE, EHOSTUNREACH, 0, 0,
cb323159 3346 0, 0, EHOSTUNREACH, 0,
0a7de745 3347 ENOPROTOOPT, ECONNREFUSED
1c79356b
A
3348};
3349
b0d623f7
A
3350static int
3351sysctl_ipforwarding SYSCTL_HANDLER_ARGS
3352{
3353#pragma unused(arg1, arg2)
3354 int i, was_ipforwarding = ipforwarding;
3355
3356 i = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
0a7de745
A
3357 if (i != 0 || req->newptr == USER_ADDR_NULL) {
3358 return i;
3359 }
b0d623f7
A
3360
3361 if (was_ipforwarding && !ipforwarding) {
3362 /* clean up IPv4 forwarding cached routes */
3363 ifnet_head_lock_shared();
3364 for (i = 0; i <= if_index; i++) {
3365 struct ifnet *ifp = ifindex2ifnet[i];
3366 if (ifp != NULL) {
6d2010ae 3367 lck_mtx_lock(&ifp->if_cached_route_lock);
39236c6e 3368 ROUTE_RELEASE(&ifp->if_fwd_route);
6d2010ae 3369 bzero(&ifp->if_fwd_route,
0a7de745 3370 sizeof(ifp->if_fwd_route));
6d2010ae 3371 lck_mtx_unlock(&ifp->if_cached_route_lock);
b0d623f7
A
3372 }
3373 }
3374 ifnet_head_done();
3375 }
3376
0a7de745 3377 return 0;
b0d623f7
A
3378}
3379
3380/*
3381 * Similar to inp_route_{copyout,copyin} routines except that these copy
3382 * out the cached IPv4 forwarding route from struct ifnet instead of the
3383 * inpcb. See comments for those routines for explanations.
3384 */
3385static void
3386ip_fwd_route_copyout(struct ifnet *ifp, struct route *dst)
3387{
3388 struct route *src = &ifp->if_fwd_route;
3389
6d2010ae
A
3390 lck_mtx_lock_spin(&ifp->if_cached_route_lock);
3391 lck_mtx_convert_spin(&ifp->if_cached_route_lock);
b0d623f7
A
3392
3393 /* Minor sanity check */
0a7de745 3394 if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET) {
b0d623f7 3395 panic("%s: wrong or corrupted route: %p", __func__, src);
0a7de745 3396 }
b0d623f7 3397
0a7de745 3398 route_copyout(dst, src, sizeof(*dst));
b0d623f7 3399
6d2010ae 3400 lck_mtx_unlock(&ifp->if_cached_route_lock);
b0d623f7
A
3401}
3402
3403static void
3404ip_fwd_route_copyin(struct ifnet *ifp, struct route *src)
3405{
3406 struct route *dst = &ifp->if_fwd_route;
3407
6d2010ae
A
3408 lck_mtx_lock_spin(&ifp->if_cached_route_lock);
3409 lck_mtx_convert_spin(&ifp->if_cached_route_lock);
b0d623f7
A
3410
3411 /* Minor sanity check */
0a7de745 3412 if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET) {
b0d623f7 3413 panic("%s: wrong or corrupted route: %p", __func__, src);
0a7de745 3414 }
b0d623f7 3415
0a7de745
A
3416 if (ifp->if_fwd_cacheok) {
3417 route_copyin(src, dst, sizeof(*src));
3418 }
b0d623f7 3419
6d2010ae 3420 lck_mtx_unlock(&ifp->if_cached_route_lock);
b0d623f7
A
3421}
3422
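/*
 * A simplified illustration of the copy-out / copy-in pattern used above:
 * take a private snapshot of the cached route under the lock, use it
 * without holding the lock, then write it back only if caching is still
 * allowed.  The names below are hypothetical; a plain pthread mutex and
 * struct assignment stand in for lck_mtx and route_copyout()/route_copyin(),
 * which additionally manage route reference counts.
 */
#if 0	/* illustrative sketch -- not compiled */
#include <pthread.h>

struct cache_entry {
	int dummy;			/* stand-in for struct route */
};

struct cache_owner {			/* stand-in for struct ifnet */
	pthread_mutex_t lock;		/* cf. if_cached_route_lock */
	int cache_ok;			/* cf. if_fwd_cacheok */
	struct cache_entry cache;	/* cf. if_fwd_route */
};

static void
cache_copyout(struct cache_owner *o, struct cache_entry *dst)
{
	pthread_mutex_lock(&o->lock);
	*dst = o->cache;		/* private snapshot for the caller */
	pthread_mutex_unlock(&o->lock);
}

static void
cache_copyin(struct cache_owner *o, const struct cache_entry *src)
{
	pthread_mutex_lock(&o->lock);
	if (o->cache_ok) {		/* only refresh if still cacheable */
		o->cache = *src;
	}
	pthread_mutex_unlock(&o->lock);
}
#endif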
1c79356b
A
3423/*
3424 * Forward a packet. If some error occurs, return an ICMP packet to
3425 * the sender. Note that we can't always generate a meaningful ICMP
3426 * message because ICMP doesn't have a large enough repertoire
3427 * of codes and types.
3428 *
3429 * If not forwarding, just drop the packet. This could be confusing
3430 * if ipforwarding was zero but some routing protocol was advancing
3431 * us as a gateway to somewhere. However, we must let the routing
3432 * protocol deal with that.
3433 *
3434 * The srcrt parameter indicates whether the packet is being forwarded
3435 * via a source route.
3436 */
9bccf70c 3437static void
b0d623f7 3438ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop)
1c79356b 3439{
b0d623f7 3440#pragma unused(next_hop)
2d21ac55
A
3441 struct ip *ip = mtod(m, struct ip *);
3442 struct sockaddr_in *sin;
3443 struct rtentry *rt;
b0d623f7 3444 struct route fwd_rt;
1c79356b
A
3445 int error, type = 0, code = 0;
3446 struct mbuf *mcopy;
3447 n_long dest;
91447636 3448 struct in_addr pkt_dst;
39236c6e 3449 u_int32_t nextmtu = 0, len;
a39ff7e2 3450 struct ip_out_args ipoa;
39236c6e 3451 struct ifnet *rcvifp = m->m_pkthdr.rcvif;
a39ff7e2
A
3452
3453 bzero(&ipoa, sizeof(ipoa));
3454 ipoa.ipoa_boundif = IFSCOPE_NONE;
3455 ipoa.ipoa_sotc = SO_TC_UNSPEC;
3456 ipoa.ipoa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
3457
39236c6e
A
3458#if IPSEC
3459 struct secpolicy *sp = NULL;
3460 int ipsecerror;
3461#endif /* IPSEC */
b0d623f7
A
3462#if PF
3463 struct pf_mtag *pf_mtag;
3464#endif /* PF */
1c79356b
A
3465
3466 dest = 0;
b0d623f7 3467 pkt_dst = ip->ip_dst;
91447636 3468
1c79356b 3469#if DIAGNOSTIC
0a7de745 3470 if (ipprintfs) {
1c79356b 3471 printf("forward: src %x dst %x ttl %x\n",
b0d623f7 3472 (u_int32_t)ip->ip_src.s_addr, (u_int32_t)pkt_dst.s_addr,
1c79356b 3473 ip->ip_ttl);
0a7de745 3474 }
1c79356b
A
3475#endif
3476
0a7de745 3477 if (m->m_flags & (M_BCAST | M_MCAST) || !in_canforward(pkt_dst)) {
b0d623f7 3478 OSAddAtomic(1, &ipstat.ips_cantforward);
1c79356b
A
3479 m_freem(m);
3480 return;
3481 }
9bccf70c
A
3482#if IPSTEALTH
3483 if (!ipstealth) {
39236c6e 3484#endif /* IPSTEALTH */
0a7de745
A
3485 if (ip->ip_ttl <= IPTTLDEC) {
3486 icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS,
3487 dest, 0);
3488 return;
1c79356b 3489 }
0a7de745
A
3490#if IPSTEALTH
3491}
39236c6e 3492#endif /* IPSTEALTH */
1c79356b 3493
b0d623f7
A
3494#if PF
3495 pf_mtag = pf_find_mtag(m);
316670eb
A
3496 if (pf_mtag != NULL && pf_mtag->pftag_rtableid != IFSCOPE_NONE) {
3497 ipoa.ipoa_boundif = pf_mtag->pftag_rtableid;
3498 ipoa.ipoa_flags |= IPOAF_BOUND_IF;
3499 }
b0d623f7
A
3500#endif /* PF */
3501
39236c6e
A
3502 ip_fwd_route_copyout(rcvifp, &fwd_rt);
3503
3504 sin = SIN(&fwd_rt.ro_dst);
3505 if (ROUTE_UNUSABLE(&fwd_rt) || pkt_dst.s_addr != sin->sin_addr.s_addr) {
3506 ROUTE_RELEASE(&fwd_rt);
b0d623f7 3507
1c79356b 3508 sin->sin_family = AF_INET;
0a7de745 3509 sin->sin_len = sizeof(*sin);
91447636 3510 sin->sin_addr = pkt_dst;
1c79356b 3511
6d2010ae 3512 rtalloc_scoped_ign(&fwd_rt, RTF_PRCLONING, ipoa.ipoa_boundif);
b0d623f7 3513 if (fwd_rt.ro_rt == NULL) {
1c79356b 3514 icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0);
b0d623f7 3515 goto done;
1c79356b 3516 }
1c79356b 3517 }
b0d623f7 3518 rt = fwd_rt.ro_rt;
1c79356b
A
3519
3520 /*
9bccf70c
A
3521 * Save the IP header and at most 8 bytes of the payload,
3522 * in case we need to generate an ICMP message to the src.
3523 *
3524 * We don't use m_copy() because it might return a reference
3525 * to a shared cluster. Both this function and ip_output()
3526 * assume exclusive access to the IP header in `m', so any
3527 * data in a cluster may change before we reach icmp_error().
1c79356b 3528 */
9bccf70c
A
3529 MGET(mcopy, M_DONTWAIT, m->m_type);
3530 if (mcopy != NULL) {
3531 M_COPY_PKTHDR(mcopy, m);
3532 mcopy->m_len = imin((IP_VHL_HL(ip->ip_vhl) << 2) + 8,
3533 (int)ip->ip_len);
3534 m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t));
3535 }
3536
3537#if IPSTEALTH
3538 if (!ipstealth) {
39236c6e 3539#endif /* IPSTEALTH */
0a7de745 3540 ip->ip_ttl -= IPTTLDEC;
9bccf70c 3541#if IPSTEALTH
0a7de745 3542}
39236c6e 3543#endif /* IPSTEALTH */
1c79356b
A
3544
3545 /*
3546 * If forwarding packet using same interface that it came in on,
3547 * perhaps should send a redirect to sender to shortcut a hop.
3548 * Only send a redirect if the source is sending directly to us,
3549 * and if the packet was not source routed and carries no options.
3550 * Also, don't send redirect if forwarding using a default route
3551 * or a route modified by a redirect.
3552 */
b0d623f7 3553 RT_LOCK_SPIN(rt);
1c79356b 3554 if (rt->rt_ifp == m->m_pkthdr.rcvif &&
0a7de745 3555 !(rt->rt_flags & (RTF_DYNAMIC | RTF_MODIFIED)) &&
39236c6e 3556 satosin(rt_key(rt))->sin_addr.s_addr != INADDR_ANY &&
6d2010ae
A
3557 ipsendredirects && !srcrt && rt->rt_ifa != NULL) {
3558 struct in_ifaddr *ia = (struct in_ifaddr *)rt->rt_ifa;
b0d623f7 3559 u_int32_t src = ntohl(ip->ip_src.s_addr);
1c79356b 3560
6d2010ae
A
3561 /* Become a regular mutex */
3562 RT_CONVERT_LOCK(rt);
3563 IFA_LOCK_SPIN(&ia->ia_ifa);
3564 if ((src & ia->ia_subnetmask) == ia->ia_subnet) {
0a7de745 3565 if (rt->rt_flags & RTF_GATEWAY) {
6d2010ae 3566 dest = satosin(rt->rt_gateway)->sin_addr.s_addr;
0a7de745 3567 } else {
6d2010ae 3568 dest = pkt_dst.s_addr;
0a7de745 3569 }
39236c6e
A
3570 /*
3571 * The Router Requirements (RFC 1812) say to send
3572 * only host redirects.
3573 */
6d2010ae
A
3574 type = ICMP_REDIRECT;
3575 code = ICMP_REDIRECT_HOST;
1c79356b 3576#if DIAGNOSTIC
0a7de745 3577 if (ipprintfs) {
39236c6e
A
3578 printf("redirect (%d) to %x\n", code,
3579 (u_int32_t)dest);
0a7de745 3580 }
1c79356b
A
3581#endif
3582 }
6d2010ae 3583 IFA_UNLOCK(&ia->ia_ifa);
1c79356b 3584 }
b0d623f7 3585 RT_UNLOCK(rt);
1c79356b 3586
39236c6e
A
3587
3588 /* Mark this packet as being forwarded from another interface */
3589 m->m_pkthdr.pkt_flags |= PKTF_FORWARDED;
3590 len = m_pktlen(m);
3591
3592 error = ip_output(m, NULL, &fwd_rt, IP_FORWARDING | IP_OUTARGS,
3593 NULL, &ipoa);
b0d623f7
A
3594
3595 /* Refresh rt since the route could have changed while in IP */
3596 rt = fwd_rt.ro_rt;
3597
39236c6e 3598 if (error != 0) {
b0d623f7
A
3599 OSAddAtomic(1, &ipstat.ips_cantforward);
3600 } else {
39236c6e
A
3601 /*
3602 * Increment stats on the source interface; the ones
3603 * for the destination interface have been taken care of
3604 * during output above by virtue of PKTF_FORWARDED.
3605 */
3606 rcvifp->if_fpackets++;
3607 rcvifp->if_fbytes += len;
3608
b0d623f7 3609 OSAddAtomic(1, &ipstat.ips_forward);
39236c6e 3610 if (type != 0) {
b0d623f7 3611 OSAddAtomic(1, &ipstat.ips_redirectsent);
39236c6e
A
3612 } else {
3613 if (mcopy != NULL) {
b0d623f7
A
3614 /*
3615 * If we didn't have to go through ipflow and
3616 * the packet was successfully consumed by
3617 * ip_output, the mcopy is rather a waste;
3618 * this could be further optimized.
3619 */
1c79356b
A
3620 m_freem(mcopy);
3621 }
b0d623f7 3622 goto done;
1c79356b
A
3623 }
3624 }
0a7de745 3625 if (mcopy == NULL) {
b0d623f7 3626 goto done;
0a7de745 3627 }
1c79356b
A
3628
3629 switch (error) {
0a7de745 3630 case 0: /* forwarded, but need redirect */
1c79356b
A
3631 /* type, code set above */
3632 break;
3633
0a7de745 3634 case ENETUNREACH: /* shouldn't happen, checked above */
1c79356b
A
3635 case EHOSTUNREACH:
3636 case ENETDOWN:
3637 case EHOSTDOWN:
3638 default:
3639 type = ICMP_UNREACH;
3640 code = ICMP_UNREACH_HOST;
3641 break;
3642
3643 case EMSGSIZE:
3644 type = ICMP_UNREACH;
3645 code = ICMP_UNREACH_NEEDFRAG;
39236c6e
A
3646
3647 if (rt == NULL) {
3648 break;
3649 } else {
b0d623f7 3650 RT_LOCK_SPIN(rt);
0a7de745 3651 if (rt->rt_ifp != NULL) {
b0d623f7 3652 nextmtu = rt->rt_ifp->if_mtu;
0a7de745 3653 }
b0d623f7
A
3654 RT_UNLOCK(rt);
3655 }
39236c6e 3656#ifdef IPSEC
0a7de745 3657 if (ipsec_bypass) {
39236c6e 3658 break;
0a7de745 3659 }
39236c6e 3660
1c79356b
A
3661 /*
3662 * If the packet is routed over an IPsec tunnel, tell the
3663 * originator the tunnel MTU.
3664 * tunnel MTU = if MTU - sizeof(IP) - ESP/AH hdrsiz
3665 * XXX quickhack!!!
3666 */
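		/*
		 * Concretely: if the tunnel's outbound interface has a
		 * 1500-byte MTU and ipsec_hdrsiz() reports N bytes of
		 * outer-IP plus ESP/AH overhead, the code below tells the
		 * originator an MTU of 1500 - N.
		 */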
39236c6e
A
3667 sp = ipsec4_getpolicybyaddr(mcopy, IPSEC_DIR_OUTBOUND,
3668 IP_FORWARDING, &ipsecerror);
1c79356b 3669
0a7de745 3670 if (sp == NULL) {
39236c6e 3671 break;
0a7de745 3672 }
b0d623f7 3673
39236c6e
A
3674 /*
3675 * find the correct route for outer IPv4
3676 * header, compute tunnel MTU.
3677 */
3678 nextmtu = 0;
1c79356b 3679
39236c6e
A
3680 if (sp->req != NULL &&
3681 sp->req->saidx.mode == IPSEC_MODE_TUNNEL) {
3682 struct secasindex saidx;
3683 struct secasvar *sav;
3684 struct route *ro;
3685 struct ip *ipm;
3686 int ipsechdr;
1c79356b 3687
39236c6e
A
3688 /* count IPsec header size */
3689 ipsechdr = ipsec_hdrsiz(sp);
3690
3691 ipm = mtod(mcopy, struct ip *);
0a7de745 3692 bcopy(&sp->req->saidx, &saidx, sizeof(saidx));
39236c6e
A
3693 saidx.mode = sp->req->saidx.mode;
3694 saidx.reqid = sp->req->saidx.reqid;
3695 sin = SIN(&saidx.src);
3696 if (sin->sin_len == 0) {
0a7de745 3697 sin->sin_len = sizeof(*sin);
39236c6e
A
3698 sin->sin_family = AF_INET;
3699 sin->sin_port = IPSEC_PORT_ANY;
3700 bcopy(&ipm->ip_src, &sin->sin_addr,
0a7de745 3701 sizeof(sin->sin_addr));
39236c6e
A
3702 }
3703 sin = SIN(&saidx.dst);
3704 if (sin->sin_len == 0) {
0a7de745 3705 sin->sin_len = sizeof(*sin);
39236c6e
A
3706 sin->sin_family = AF_INET;
3707 sin->sin_port = IPSEC_PORT_ANY;
3708 bcopy(&ipm->ip_dst, &sin->sin_addr,
0a7de745 3709 sizeof(sin->sin_addr));
39236c6e
A
3710 }
3711 sav = key_allocsa_policy(&saidx);
3712 if (sav != NULL) {
3713 lck_mtx_lock(sadb_mutex);
3714 if (sav->sah != NULL) {
5c9f4661 3715 ro = (struct route *)&sav->sah->sa_route;
39236c6e
A
3716 if (ro->ro_rt != NULL) {
3717 RT_LOCK(ro->ro_rt);
3718 if (ro->ro_rt->rt_ifp != NULL) {
3719 nextmtu = ro->ro_rt->
3720 rt_ifp->if_mtu;
3721 nextmtu -= ipsechdr;
2d21ac55 3722 }
39236c6e 3723 RT_UNLOCK(ro->ro_rt);
1c79356b
A
3724 }
3725 }
39236c6e
A
3726 key_freesav(sav, KEY_SADB_LOCKED);
3727 lck_mtx_unlock(sadb_mutex);
1c79356b
A
3728 }
3729 }
39236c6e
A
3730 key_freesp(sp, KEY_SADB_UNLOCKED);
3731#endif /* IPSEC */
1c79356b
A
3732 break;
3733
3734 case ENOBUFS:
39236c6e
A
3735 /*
3736 * Per RFC 1812 (Requirements for IP Version 4 Routers),
3737 * a router should not generate ICMP_SOURCEQUENCH.
3738 * Source quench could be a big problem under DoS attacks,
3739 * or if the underlying interface is rate-limited.
3740 * Those who need source quench packets may re-enable them
3741 * via the net.inet.ip.sendsourcequench sysctl.
3742 */
3743 if (ip_sendsourcequench == 0) {
3744 m_freem(mcopy);
3745 goto done;
3746 } else {
3747 type = ICMP_SOURCEQUENCH;
3748 code = 0;
3749 }
1c79356b 3750 break;
9bccf70c 3751
f427ee49 3752 case EACCES:
9bccf70c 3753 m_freem(mcopy);
b0d623f7 3754 goto done;
1c79356b 3755 }
b0d623f7 3756
0a7de745 3757 if (type == ICMP_UNREACH && code == ICMP_UNREACH_NEEDFRAG) {
39236c6e 3758 OSAddAtomic(1, &ipstat.ips_cantfrag);
0a7de745 3759 }
39236c6e 3760
b0d623f7
A
3761 icmp_error(mcopy, type, code, dest, nextmtu);
3762done:
39236c6e 3763 ip_fwd_route_copyin(rcvifp, &fwd_rt);
1c79356b
A
3764}
3765
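/*
 * The redirect decision inside ip_forward() above can be summarized as a
 * predicate: the packet must be going back out the interface it arrived on,
 * over a route that is neither a default route nor the product of a dynamic
 * update or redirect, the packet must not be source routed, and the original
 * sender must sit on the directly attached subnet.  Per the Router
 * Requirements only host redirects are sent.  A compressed sketch with
 * hypothetical parameter names:
 */
#if 0	/* illustrative sketch -- not compiled */
#include <stdbool.h>
#include <stdint.h>

static bool
example_should_send_host_redirect(bool out_ifp_is_rcv_ifp,
    bool rt_dynamic_or_modified, bool rt_is_default, bool source_routed,
    uint32_t src, uint32_t ia_subnet, uint32_t ia_subnetmask)
{
	if (!out_ifp_is_rcv_ifp || rt_dynamic_or_modified ||
	    rt_is_default || source_routed) {
		return false;
	}
	/* the sender must be directly attached to the receiving subnet */
	return (src & ia_subnetmask) == ia_subnet;
}
#endif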
6d2010ae 3766int
39236c6e
A
3767ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip,
3768 struct mbuf *m)
1c79356b 3769{
6d2010ae 3770 *mp = NULL;
1c79356b
A
3771 if (inp->inp_socket->so_options & SO_TIMESTAMP) {
3772 struct timeval tv;
3773
39236c6e 3774 getmicrotime(&tv);
0a7de745 3775 mp = sbcreatecontrol_mbuf((caddr_t)&tv, sizeof(tv),
39236c6e 3776 SCM_TIMESTAMP, SOL_SOCKET, mp);
6d2010ae
A
3777 if (*mp == NULL) {
3778 goto no_mbufs;
3779 }
1c79356b 3780 }
39236c6e 3781 if (inp->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) {
6d2010ae
A
3782 uint64_t time;
3783
3784 time = mach_absolute_time();
0a7de745 3785 mp = sbcreatecontrol_mbuf((caddr_t)&time, sizeof(time),
39236c6e 3786 SCM_TIMESTAMP_MONOTONIC, SOL_SOCKET, mp);
6d2010ae
A
3787 if (*mp == NULL) {
3788 goto no_mbufs;
3789 }
39236c6e 3790 }
d9a64523
A
3791 if (inp->inp_socket->so_options & SO_TIMESTAMP_CONTINUOUS) {
3792 uint64_t time;
3793
3794 time = mach_continuous_time();
0a7de745
A
3795 mp = sbcreatecontrol_mbuf((caddr_t)&time, sizeof(time),
3796 SCM_TIMESTAMP_CONTINUOUS, SOL_SOCKET, mp);
d9a64523
A
3797 if (*mp == NULL) {
3798 goto no_mbufs;
3799 }
3800 }
f427ee49
A
3801 if (inp->inp_flags & INP_RECVDSTADDR
3802#if CONTENT_FILTER
3803 /* Content Filter needs to see local address */
3804 || (inp->inp_socket->so_cfil_db != NULL)
3805#endif
3806 ) {
39236c6e 3807 mp = sbcreatecontrol_mbuf((caddr_t)&ip->ip_dst,
0a7de745 3808 sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP, mp);
6d2010ae
A
3809 if (*mp == NULL) {
3810 goto no_mbufs;
3811 }
1c79356b
A
3812 }
3813#ifdef notyet
39236c6e
A
3814 /*
3815 * XXX
1c79356b
A
3816 * Moving these out of udp_input() made them even more broken
3817 * than they already were.
3818 */
3819 /* options were tossed already */
3820 if (inp->inp_flags & INP_RECVOPTS) {
39236c6e 3821 mp = sbcreatecontrol_mbuf((caddr_t)opts_deleted_above,
0a7de745 3822 sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP, mp);
6d2010ae
A
3823 if (*mp == NULL) {
3824 goto no_mbufs;
3825 }
1c79356b
A
3826 }
3827 /* ip_srcroute doesn't do what we want here, need to fix */
3828 if (inp->inp_flags & INP_RECVRETOPTS) {
39236c6e 3829 mp = sbcreatecontrol_mbuf((caddr_t)ip_srcroute(),
0a7de745 3830 sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP, mp);
6d2010ae
A
3831 if (*mp == NULL) {
3832 goto no_mbufs;
3833 }
1c79356b 3834 }
39236c6e 3835#endif /* notyet */
1c79356b
A
3836 if (inp->inp_flags & INP_RECVIF) {
3837 struct ifnet *ifp;
39236c6e
A
3838 uint8_t sdlbuf[SOCK_MAXADDRLEN + 1];
3839 struct sockaddr_dl *sdl2 = SDL(&sdlbuf);
3840
3841 /*
3842 * Make sure to accommodate the largest possible
3843 * size of SA(if_lladdr)->sa_len.
3844 */
0a7de745 3845 _CASSERT(sizeof(sdlbuf) == (SOCK_MAXADDRLEN + 1));
1c79356b 3846
91447636 3847 ifnet_head_lock_shared();
6d2010ae
A
3848 if ((ifp = m->m_pkthdr.rcvif) != NULL &&
3849 ifp->if_index && (ifp->if_index <= if_index)) {
13fec989 3850 struct ifaddr *ifa = ifnet_addrs[ifp->if_index - 1];
39236c6e 3851 struct sockaddr_dl *sdp;
2d21ac55 3852
0a7de745 3853 if (!ifa || !ifa->ifa_addr) {
13fec989 3854 goto makedummy;
0a7de745 3855 }
2d21ac55 3856
6d2010ae 3857 IFA_LOCK_SPIN(ifa);
39236c6e 3858 sdp = SDL(ifa->ifa_addr);
1c79356b
A
3859 /*
3860 * Change our mind and don't try to copy.
3861 */
39236c6e 3862 if (sdp->sdl_family != AF_LINK) {
6d2010ae 3863 IFA_UNLOCK(ifa);
1c79356b
A
3864 goto makedummy;
3865 }
39236c6e 3866 /* the above _CASSERT ensures sdl_len fits in sdlbuf */
1c79356b 3867 bcopy(sdp, sdl2, sdp->sdl_len);
6d2010ae 3868 IFA_UNLOCK(ifa);
1c79356b 3869 } else {
6d2010ae 3870makedummy:
39236c6e
A
3871 sdl2->sdl_len =
3872 offsetof(struct sockaddr_dl, sdl_data[0]);
1c79356b
A
3873 sdl2->sdl_family = AF_LINK;
3874 sdl2->sdl_index = 0;
3875 sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0;
3876 }
91447636 3877 ifnet_head_done();
39236c6e
A
3878 mp = sbcreatecontrol_mbuf((caddr_t)sdl2, sdl2->sdl_len,
3879 IP_RECVIF, IPPROTO_IP, mp);
6d2010ae
A
3880 if (*mp == NULL) {
3881 goto no_mbufs;
3882 }
1c79356b 3883 }
55e303ae 3884 if (inp->inp_flags & INP_RECVTTL) {
39236c6e 3885 mp = sbcreatecontrol_mbuf((caddr_t)&ip->ip_ttl,
0a7de745 3886 sizeof(ip->ip_ttl), IP_RECVTTL, IPPROTO_IP, mp);
6d2010ae
A
3887 if (*mp == NULL) {
3888 goto no_mbufs;
3889 }
3890 }
39236c6e 3891 if (inp->inp_socket->so_flags & SOF_RECV_TRAFFIC_CLASS) {
316670eb
A
3892 int tc = m_get_traffic_class(m);
3893
0a7de745 3894 mp = sbcreatecontrol_mbuf((caddr_t)&tc, sizeof(tc),
39236c6e 3895 SO_TRAFFIC_CLASS, SOL_SOCKET, mp);
6d2010ae
A
3896 if (*mp == NULL) {
3897 goto no_mbufs;
3898 }
3899 }
3900 if (inp->inp_flags & INP_PKTINFO) {
3901 struct in_pktinfo pi;
3902
0a7de745
A
3903 bzero(&pi, sizeof(struct in_pktinfo));
3904 bcopy(&ip->ip_dst, &pi.ipi_addr, sizeof(struct in_addr));
39236c6e
A
3905 pi.ipi_ifindex = (m != NULL && m->m_pkthdr.rcvif != NULL) ?
3906 m->m_pkthdr.rcvif->if_index : 0;
3907
3908 mp = sbcreatecontrol_mbuf((caddr_t)&pi,
0a7de745 3909 sizeof(struct in_pktinfo), IP_RECVPKTINFO, IPPROTO_IP, mp);
6d2010ae
A
3910 if (*mp == NULL) {
3911 goto no_mbufs;
3912 }
55e303ae 3913 }
813fb2f6
A
3914 if (inp->inp_flags & INP_RECVTOS) {
3915 mp = sbcreatecontrol_mbuf((caddr_t)&ip->ip_tos,
3916 sizeof(u_char), IP_RECVTOS, IPPROTO_IP, mp);
3917 if (*mp == NULL) {
3918 goto no_mbufs;
3919 }
3920 }
0a7de745 3921 return 0;
6d2010ae
A
3922
3923no_mbufs:
3924 ipstat.ips_pktdropcntrl++;
0a7de745 3925 return ENOBUFS;
1c79356b
A
3926}
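/*
 * The control mbufs built by ip_savecontrol() surface in user space as
 * ancillary data on recvmsg().  A minimal sketch using the classic
 * IP_RECVDSTADDR option (UDP socket, arbitrary port, error handling
 * trimmed):
 */
#if 0	/* illustrative user-space sketch -- not compiled */
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	int s = socket(AF_INET, SOCK_DGRAM, 0);
	int on = 1;
	struct sockaddr_in sin;
	char payload[2048], cbuf[256];
	struct iovec iov = { .iov_base = payload, .iov_len = sizeof(payload) };
	struct msghdr msg;
	struct cmsghdr *cm;

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_len = sizeof(sin);
	sin.sin_port = htons(9999);		/* arbitrary example port */
	sin.sin_addr.s_addr = htonl(INADDR_ANY);
	bind(s, (struct sockaddr *)&sin, sizeof(sin));

	/* ask for the packet's destination address as a cmsg */
	setsockopt(s, IPPROTO_IP, IP_RECVDSTADDR, &on, sizeof(on));

	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);
	recvmsg(s, &msg, 0);

	for (cm = CMSG_FIRSTHDR(&msg); cm != NULL; cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level == IPPROTO_IP &&
		    cm->cmsg_type == IP_RECVDSTADDR) {
			struct in_addr dst;

			memcpy(&dst, CMSG_DATA(cm), sizeof(dst));
			printf("packet was addressed to %s\n", inet_ntoa(dst));
		}
	}
	return 0;
}
#endif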
3927
316670eb
A
3928static inline u_short
3929ip_cksum(struct mbuf *m, int hlen)
3930{
316670eb 3931 u_short sum;
316670eb
A
3932
3933 if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) {
3934 sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID);
39236c6e
A
3935 } else if (!(m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) &&
3936 !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
316670eb 3937 /*
39236c6e
A
3938 * The packet arrived on an interface which isn't capable
3939 * of verifying the IP header checksum; compute it now.
316670eb 3940 */
39236c6e 3941 sum = ip_cksum_hdr_in(m, hlen);
316670eb 3942 } else {
316670eb 3943 sum = 0;
39236c6e
A
3944 m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
3945 CSUM_IP_CHECKED | CSUM_IP_VALID);
3946 m->m_pkthdr.csum_data = 0xffff;
316670eb
A
3947 }
3948
0a7de745 3949 if (sum != 0) {
316670eb 3950 OSAddAtomic(1, &ipstat.ips_badsum);
0a7de745 3951 }
39236c6e 3952
0a7de745 3953 return sum;
39236c6e
A
3954}
3955
3956static int
3957ip_getstat SYSCTL_HANDLER_ARGS
3958{
3959#pragma unused(oidp, arg1, arg2)
0a7de745
A
3960 if (req->oldptr == USER_ADDR_NULL) {
3961 req->oldlen = (size_t)sizeof(struct ipstat);
3962 }
39236c6e 3963
0a7de745 3964 return SYSCTL_OUT(req, &ipstat, MIN(sizeof(ipstat), req->oldlen));
39236c6e
A
3965}
3966
3967void
3968ip_setsrcifaddr_info(struct mbuf *m, uint32_t src_idx, struct in_ifaddr *ia)
3969{
3970 VERIFY(m->m_flags & M_PKTHDR);
3971
3972 /*
3973 * If the source ifaddr is specified, pick up the information
3974 * from there; otherwise just grab the passed-in ifindex as the
3975 * caller may not have the ifaddr available.
3976 */
3977 if (ia != NULL) {
3978 m->m_pkthdr.pkt_flags |= PKTF_IFAINFO;
3979 m->m_pkthdr.src_ifindex = ia->ia_ifp->if_index;
3980 } else {
3981 m->m_pkthdr.src_ifindex = src_idx;
0a7de745 3982 if (src_idx != 0) {
39236c6e 3983 m->m_pkthdr.pkt_flags |= PKTF_IFAINFO;
0a7de745 3984 }
39236c6e
A
3985 }
3986}
3987
3988void
3989ip_setdstifaddr_info(struct mbuf *m, uint32_t dst_idx, struct in_ifaddr *ia)
3990{
3991 VERIFY(m->m_flags & M_PKTHDR);
3992
3993 /*
3994 * If the destination ifaddr is specified, pick up the information
3995 * from there; otherwise just grab the passed-in ifindex as the
3996 * caller may not have the ifaddr available.
3997 */
3998 if (ia != NULL) {
3999 m->m_pkthdr.pkt_flags |= PKTF_IFAINFO;
4000 m->m_pkthdr.dst_ifindex = ia->ia_ifp->if_index;
4001 } else {
4002 m->m_pkthdr.dst_ifindex = dst_idx;
0a7de745 4003 if (dst_idx != 0) {
39236c6e 4004 m->m_pkthdr.pkt_flags |= PKTF_IFAINFO;
0a7de745 4005 }
39236c6e
A
4006 }
4007}
4008
4009int
4010ip_getsrcifaddr_info(struct mbuf *m, uint32_t *src_idx, uint32_t *iaf)
4011{
4012 VERIFY(m->m_flags & M_PKTHDR);
4013
0a7de745
A
4014 if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO)) {
4015 return -1;
4016 }
39236c6e 4017
0a7de745 4018 if (src_idx != NULL) {
39236c6e 4019 *src_idx = m->m_pkthdr.src_ifindex;
0a7de745 4020 }
39236c6e 4021
0a7de745 4022 if (iaf != NULL) {
39236c6e 4023 *iaf = 0;
0a7de745 4024 }
39236c6e 4025
0a7de745 4026 return 0;
39236c6e
A
4027}
4028
4029int
4030ip_getdstifaddr_info(struct mbuf *m, uint32_t *dst_idx, uint32_t *iaf)
4031{
4032 VERIFY(m->m_flags & M_PKTHDR);
4033
0a7de745
A
4034 if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO)) {
4035 return -1;
4036 }
39236c6e 4037
0a7de745 4038 if (dst_idx != NULL) {
39236c6e 4039 *dst_idx = m->m_pkthdr.dst_ifindex;
0a7de745 4040 }
39236c6e 4041
0a7de745 4042 if (iaf != NULL) {
39236c6e 4043 *iaf = 0;
0a7de745 4044 }
39236c6e 4045
0a7de745 4046 return 0;
39236c6e
A
4047}
4048
4049/*
4050 * Protocol input handler for IPPROTO_GRE.
4051 */
4052void
4053gre_input(struct mbuf *m, int off)
4054{
4055 gre_input_func_t fn = gre_input_func;
4056
4057 /*
4058 * If there is a registered GRE input handler, pass mbuf to it.
4059 */
4060 if (fn != NULL) {
4061 lck_mtx_unlock(inet_domain_mutex);
4062 m = fn(m, off, (mtod(m, struct ip *))->ip_p);
4063 lck_mtx_lock(inet_domain_mutex);
316670eb
A
4064 }
4065
39236c6e
A
4066 /*
4067 * If no matching tunnel that is up is found, inject the
4068 * mbuf into the raw IP input path so a raw socket may pick it up.
4069 */
0a7de745 4070 if (m != NULL) {
39236c6e 4071 rip_input(m, off);
0a7de745 4072 }
39236c6e
A
4073}
4074
4075/*
4076 * Private KPI for PPP/PPTP.
4077 */
4078int
4079ip_gre_register_input(gre_input_func_t fn)
4080{
4081 lck_mtx_lock(inet_domain_mutex);
4082 gre_input_func = fn;
4083 lck_mtx_unlock(inet_domain_mutex);
4084
0a7de745 4085 return 0;
316670eb 4086}
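/*
 * A hypothetical GRE handler shaped like the gre_input_func_t call site in
 * gre_input() above: a real handler would look up its tunnel and consume
 * the mbuf on a match; returning the mbuf makes gre_input() fall through
 * to rip_input().  The names below are illustrative only.
 */
#if 0	/* illustrative sketch -- not compiled */
static struct mbuf *
example_gre_handler(struct mbuf *m, int off, int proto)
{
#pragma unused(off, proto)
	/* nothing matched; hand the packet back for raw-IP delivery */
	return m;
}

static void
example_gre_attach(void)
{
	(void)ip_gre_register_input(example_gre_handler);
}
#endif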
3e170ce0 4087
39037602 4088#if (DEBUG || DEVELOPMENT)
3e170ce0
A
4089static int
4090sysctl_reset_ip_input_stats SYSCTL_HANDLER_ARGS
4091{
4092#pragma unused(arg1, arg2)
4093 int error, i;
4094
4095 i = ip_input_measure;
4096 error = sysctl_handle_int(oidp, &i, 0, req);
0a7de745 4097 if (error || req->newptr == USER_ADDR_NULL) {
3e170ce0 4098 goto done;
0a7de745 4099 }
3e170ce0
A
4100 /* impose bounds */
4101 if (i < 0 || i > 1) {
4102 error = EINVAL;
4103 goto done;
4104 }
4105 if (ip_input_measure != i && i == 1) {
4106 net_perf_initialize(&net_perf, ip_input_measure_bins);
4107 }
4108 ip_input_measure = i;
4109done:
0a7de745 4110 return error;
3e170ce0
A
4111}
4112
4113static int
4114sysctl_ip_input_measure_bins SYSCTL_HANDLER_ARGS
4115{
4116#pragma unused(arg1, arg2)
4117 int error;
4118 uint64_t i;
4119
4120 i = ip_input_measure_bins;
4121 error = sysctl_handle_quad(oidp, &i, 0, req);
0a7de745 4122 if (error || req->newptr == USER_ADDR_NULL) {
3e170ce0 4123 goto done;
0a7de745 4124 }
3e170ce0
A
4125 /* validate data */
4126 if (!net_perf_validate_bins(i)) {
4127 error = EINVAL;
4128 goto done;
4129 }
4130 ip_input_measure_bins = i;
4131done:
0a7de745 4132 return error;
3e170ce0
A
4133}
4134
4135static int
4136sysctl_ip_input_getperf SYSCTL_HANDLER_ARGS
4137{
4138#pragma unused(oidp, arg1, arg2)
0a7de745
A
4139 if (req->oldptr == USER_ADDR_NULL) {
4140 req->oldlen = (size_t)sizeof(net_perf);
4141 }
3e170ce0 4142
0a7de745 4143 return SYSCTL_OUT(req, &net_perf, MIN(sizeof(net_perf), req->oldlen));
3e170ce0 4144}
39037602 4145#endif /* (DEBUG || DEVELOPMENT) */
eb6b6ca3
A
4146
4147static int
4148sysctl_ip_checkinterface SYSCTL_HANDLER_ARGS
4149{
4150#pragma unused(arg1, arg2)
4151 int error, i;
4152
4153 i = ip_checkinterface;
4154 error = sysctl_handle_int(oidp, &i, 0, req);
4155 if (error != 0 || req->newptr == USER_ADDR_NULL) {
4156 return error;
4157 }
4158
4159 switch (i) {
4160 case IP_CHECKINTERFACE_WEAK_ES:
4161 case IP_CHECKINTERFACE_HYBRID_ES:
4162 case IP_CHECKINTERFACE_STRONG_ES:
4163 if (ip_checkinterface != i) {
4164 ip_checkinterface = i;
4165 os_log(OS_LOG_DEFAULT, "%s: ip_checkinterface is now %d\n",
4166 __func__, ip_checkinterface);
4167 }
4168 break;
4169 default:
4170 error = EINVAL;
4171 break;
4172 }
4173 return error;
4174}