/*
 * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Copyright (c) 1982, 1986, 1988, 1993
 * The Regents of the University of California. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *    This product includes software developed by the University of
 *    California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ip_input.c	8.2 (Berkeley) 1/4/94
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2007 to introduce
 * support for mandatory and extensible security protections. This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */

#define _IP_VHL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
#include <sys/mcache.h>
#include <sys/socketvar.h>
#include <sys/kdebug.h>
#include <mach/mach_time.h>
#include <mach/sdt.h>

#include <machine/endian.h>
#include <dev/random/randomdev.h>

#include <kern/queue.h>
#include <kern/locks.h>
#include <libkern/OSAtomic.h>

#include <pexpert/pexpert.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/kpi_protocol.h>
#include <net/ntstat.h>
#include <net/dlil.h>
#include <net/classq/classq.h>
#include <net/net_perf.h>
#include <net/init.h>
#if PF
#include <net/pfvar.h>
#endif /* PF */

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/in_arp.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/ip_icmp.h>
#include <netinet/ip_fw.h>
#include <netinet/ip_divert.h>
#include <netinet/kpi_ipfilter_var.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <netinet/bootp.h>
#include <netinet/lro_ext.h>

#if DUMMYNET
#include <netinet/ip_dummynet.h>
#endif /* DUMMYNET */

#if CONFIG_MACF_NET
#include <security/mac_framework.h>
#endif /* CONFIG_MACF_NET */

#if IPSEC
#include <netinet6/ipsec.h>
#include <netkey/key.h>
#endif /* IPSEC */

#define DBG_LAYER_BEG NETDBG_CODE(DBG_NETIP, 0)
#define DBG_LAYER_END NETDBG_CODE(DBG_NETIP, 2)
#define DBG_FNC_IP_INPUT NETDBG_CODE(DBG_NETIP, (2 << 8))

#if IPSEC
extern int ipsec_bypass;
extern lck_mtx_t *sadb_mutex;

lck_grp_t *sadb_stat_mutex_grp;
lck_grp_attr_t *sadb_stat_mutex_grp_attr;
lck_attr_t *sadb_stat_mutex_attr;
decl_lck_mtx_data(, sadb_stat_mutex_data);
lck_mtx_t *sadb_stat_mutex = &sadb_stat_mutex_data;
#endif /* IPSEC */

MBUFQ_HEAD(fq_head);

static int frag_timeout_run; /* frag timer is scheduled to run */
static void frag_timeout(void *);
static void frag_sched_timeout(void);

static struct ipq *ipq_alloc(int);
static void ipq_free(struct ipq *);
static void ipq_updateparams(void);
static void ip_input_second_pass(struct mbuf *, struct ifnet *,
    u_int32_t, int, int, struct ip_fw_in_args *, int);

decl_lck_mtx_data(static, ipqlock);
static lck_attr_t *ipqlock_attr;
static lck_grp_t *ipqlock_grp;
static lck_grp_attr_t *ipqlock_grp_attr;

/* Packet reassembly stuff */
#define IPREASS_NHASH_LOG2 6
#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2)
#define IPREASS_HMASK (IPREASS_NHASH - 1)
#define IPREASS_HASH(x, y) \
    (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK)

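/*
 * For illustration, IPREASS_HASH() folds bits 0-3 and 8-11 of the source
 * address into one byte and XORs that with the IP ID: if (x & 0xF) is 0x3
 * and ((x >> 8) & 0xF) is 0x5, the folded byte is 0x53; with y = 0x1234
 * this gives 0x53 ^ 0x1234 = 0x1267, and 0x1267 & IPREASS_HMASK (0x3f) =
 * 0x27, i.e. fragment reassembly bucket 39 of the 64 buckets.
 */
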
/* IP fragment reassembly queues (protected by ipqlock) */
static TAILQ_HEAD(ipqhead, ipq) ipq[IPREASS_NHASH]; /* ip reassembly queues */
static int maxnipq; /* max packets in reass queues */
static u_int32_t maxfragsperpacket; /* max frags/packet in reass queues */
static u_int32_t nipq; /* # of packets in reass queues */
static u_int32_t ipq_limit; /* ipq allocation limit */
static u_int32_t ipq_count; /* current # of allocated ipq's */

static int sysctl_ipforwarding SYSCTL_HANDLER_ARGS;
static int sysctl_maxnipq SYSCTL_HANDLER_ARGS;
static int sysctl_maxfragsperpacket SYSCTL_HANDLER_ARGS;

#if (DEBUG || DEVELOPMENT)
static int sysctl_reset_ip_input_stats SYSCTL_HANDLER_ARGS;
static int sysctl_ip_input_measure_bins SYSCTL_HANDLER_ARGS;
static int sysctl_ip_input_getperf SYSCTL_HANDLER_ARGS;
#endif /* (DEBUG || DEVELOPMENT) */

int ipforwarding = 0;
SYSCTL_PROC(_net_inet_ip, IPCTL_FORWARDING, forwarding,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &ipforwarding, 0,
    sysctl_ipforwarding, "I", "Enable IP forwarding between interfaces");

static int ipsendredirects = 1; /* XXX */
SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect,
    CTLFLAG_RW | CTLFLAG_LOCKED, &ipsendredirects, 0,
    "Enable sending IP redirects");

int ip_defttl = IPDEFTTL;
SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW | CTLFLAG_LOCKED,
    &ip_defttl, 0, "Maximum TTL on IP packets");

static int ip_dosourceroute = 0;
SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute,
    CTLFLAG_RW | CTLFLAG_LOCKED, &ip_dosourceroute, 0,
    "Enable forwarding source routed IP packets");

static int ip_acceptsourceroute = 0;
SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute,
    CTLFLAG_RW | CTLFLAG_LOCKED, &ip_acceptsourceroute, 0,
    "Enable accepting source routed IP packets");

static int ip_sendsourcequench = 0;
SYSCTL_INT(_net_inet_ip, OID_AUTO, sendsourcequench,
    CTLFLAG_RW | CTLFLAG_LOCKED, &ip_sendsourcequench, 0,
    "Enable the transmission of source quench packets");

SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &maxnipq, 0, sysctl_maxnipq,
    "I", "Maximum number of IPv4 fragment reassembly queue entries");

SYSCTL_UINT(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_RD | CTLFLAG_LOCKED,
    &nipq, 0, "Current number of IPv4 fragment reassembly queue entries");

SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragsperpacket,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &maxfragsperpacket, 0,
    sysctl_maxfragsperpacket, "I",
    "Maximum number of IPv4 fragments allowed per packet");

static uint32_t ip_adj_clear_hwcksum = 0;
SYSCTL_UINT(_net_inet_ip, OID_AUTO, adj_clear_hwcksum,
    CTLFLAG_RW | CTLFLAG_LOCKED, &ip_adj_clear_hwcksum, 0,
    "Invalidate hwcksum info when adjusting length");

static uint32_t ip_adj_partial_sum = 1;
SYSCTL_UINT(_net_inet_ip, OID_AUTO, adj_partial_sum,
    CTLFLAG_RW | CTLFLAG_LOCKED, &ip_adj_partial_sum, 0,
    "Perform partial sum adjustment of trailing bytes at IP layer");

/*
 * XXX - Setting ip_checkinterface mostly implements the receive side of
 * the Strong ES model described in RFC 1122, but since the routing table
 * and transmit implementation do not implement the Strong ES model,
 * setting this to 1 results in an odd hybrid.
 *
 * XXX - ip_checkinterface currently must be disabled if you use ipnat
 * to translate the destination address to another local interface.
 *
 * XXX - ip_checkinterface must be disabled if you add IP aliases
 * to the loopback interface instead of the interface where the
 * packets for those addresses are received.
 */
static int ip_checkinterface = 0;
SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW | CTLFLAG_LOCKED,
    &ip_checkinterface, 0, "Verify packet arrives on correct interface");

static int ip_chaining = 1;
SYSCTL_INT(_net_inet_ip, OID_AUTO, rx_chaining, CTLFLAG_RW | CTLFLAG_LOCKED,
    &ip_chaining, 1, "Do receive side ip address based chaining");

static int ip_chainsz = 6;
SYSCTL_INT(_net_inet_ip, OID_AUTO, rx_chainsz, CTLFLAG_RW | CTLFLAG_LOCKED,
    &ip_chainsz, 1, "IP receive side max chaining");

#if (DEBUG || DEVELOPMENT)
static int ip_input_measure = 0;
SYSCTL_PROC(_net_inet_ip, OID_AUTO, input_perf,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &ip_input_measure, 0, sysctl_reset_ip_input_stats, "I", "Do time measurement");

static uint64_t ip_input_measure_bins = 0;
SYSCTL_PROC(_net_inet_ip, OID_AUTO, input_perf_bins,
    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &ip_input_measure_bins, 0,
    sysctl_ip_input_measure_bins, "I",
    "bins for chaining performance data histogram");

static net_perf_t net_perf;
SYSCTL_PROC(_net_inet_ip, OID_AUTO, input_perf_data,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, sysctl_ip_input_getperf, "S,net_perf",
    "IP input performance data (struct net_perf, net/net_perf.h)");
#endif /* (DEBUG || DEVELOPMENT) */

#if DIAGNOSTIC
static int ipprintfs = 0;
#endif

struct protosw *ip_protox[IPPROTO_MAX];

static lck_grp_attr_t *in_ifaddr_rwlock_grp_attr;
static lck_grp_t *in_ifaddr_rwlock_grp;
static lck_attr_t *in_ifaddr_rwlock_attr;
decl_lck_rw_data(, in_ifaddr_rwlock_data);
lck_rw_t *in_ifaddr_rwlock = &in_ifaddr_rwlock_data;

/* Protected by in_ifaddr_rwlock */
struct in_ifaddrhead in_ifaddrhead; /* first inet address */
struct in_ifaddrhashhead *in_ifaddrhashtbl; /* inet addr hash table */

#define INADDR_NHASH 61
static u_int32_t inaddr_nhash; /* hash table size */
static u_int32_t inaddr_hashp; /* next largest prime */

static int ip_getstat SYSCTL_HANDLER_ARGS;
struct ipstat ipstat;
SYSCTL_PROC(_net_inet_ip, IPCTL_STATS, stats,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, ip_getstat, "S,ipstat",
    "IP statistics (struct ipstat, netinet/ip_var.h)");

#if IPCTL_DEFMTU
SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW | CTLFLAG_LOCKED,
    &ip_mtu, 0, "Default MTU");
#endif /* IPCTL_DEFMTU */

#if IPSTEALTH
static int ipstealth = 0;
SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW | CTLFLAG_LOCKED,
    &ipstealth, 0, "");
#endif /* IPSTEALTH */

/* Firewall hooks */
#if IPFIREWALL
ip_fw_chk_t *ip_fw_chk_ptr;
int fw_enable = 1;
int fw_bypass = 1;
int fw_one_pass = 0;
#endif /* IPFIREWALL */

#if DUMMYNET
ip_dn_io_t *ip_dn_io_ptr;
#endif /* DUMMYNET */

SYSCTL_NODE(_net_inet_ip, OID_AUTO, linklocal,
    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "link local");

struct ip_linklocal_stat ip_linklocal_stat;
SYSCTL_STRUCT(_net_inet_ip_linklocal, OID_AUTO, stat,
    CTLFLAG_RD | CTLFLAG_LOCKED, &ip_linklocal_stat, ip_linklocal_stat,
    "Number of link local packets with TTL less than 255");

SYSCTL_NODE(_net_inet_ip_linklocal, OID_AUTO, in,
    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "link local input");

int ip_linklocal_in_allowbadttl = 1;
SYSCTL_INT(_net_inet_ip_linklocal_in, OID_AUTO, allowbadttl,
    CTLFLAG_RW | CTLFLAG_LOCKED, &ip_linklocal_in_allowbadttl, 0,
    "Allow incoming link local packets with TTL less than 255");


/*
 * We need to save the IP options in case a protocol wants to respond
 * to an incoming packet over the same route if the packet got here
 * using IP source routing. This allows connection establishment and
 * maintenance when the remote end is on a network that is not known
 * to us.
 */
static int ip_nhops = 0;
static struct ip_srcrt {
    struct in_addr dst; /* final destination */
    char nop; /* one NOP to align */
    char srcopt[IPOPT_OFFSET + 1]; /* OPTVAL, OLEN and OFFSET */
    struct in_addr route[MAX_IPOPTLEN / sizeof(struct in_addr)];
} ip_srcrt;

static void in_ifaddrhashtbl_init(void);
static void save_rte(u_char *, struct in_addr);
static int ip_dooptions(struct mbuf *, int, struct sockaddr_in *);
static void ip_forward(struct mbuf *, int, struct sockaddr_in *);
static void frag_freef(struct ipqhead *, struct ipq *);
#if IPDIVERT
#ifdef IPDIVERT_44
static struct mbuf *ip_reass(struct mbuf *, u_int32_t *, u_int16_t *);
#else /* !IPDIVERT_44 */
static struct mbuf *ip_reass(struct mbuf *, u_int16_t *, u_int16_t *);
#endif /* !IPDIVERT_44 */
#else /* !IPDIVERT */
static struct mbuf *ip_reass(struct mbuf *);
#endif /* !IPDIVERT */
static void ip_fwd_route_copyout(struct ifnet *, struct route *);
static void ip_fwd_route_copyin(struct ifnet *, struct route *);
static inline u_short ip_cksum(struct mbuf *, int);

int ip_use_randomid = 1;
SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW | CTLFLAG_LOCKED,
    &ip_use_randomid, 0, "Randomize IP packets IDs");

/*
 * On platforms which require strict alignment (currently for anything but
 * i386 or x86_64), check if the IP header pointer is 32-bit aligned; if not,
 * copy the contents of the mbuf chain into a new chain, and free the original
 * one. Create some head room in the first mbuf of the new chain, in case
 * it's needed later on.
 */
#if defined(__i386__) || defined(__x86_64__)
#define IP_HDR_ALIGNMENT_FIXUP(_m, _ifp, _action) do { } while (0)
#else /* !__i386__ && !__x86_64__ */
#define IP_HDR_ALIGNMENT_FIXUP(_m, _ifp, _action) do { \
    if (!IP_HDR_ALIGNED_P(mtod(_m, caddr_t))) { \
        struct mbuf *_n; \
        struct ifnet *__ifp = (_ifp); \
        atomic_add_64(&(__ifp)->if_alignerrs, 1); \
        if (((_m)->m_flags & M_PKTHDR) && \
            (_m)->m_pkthdr.pkt_hdr != NULL) \
            (_m)->m_pkthdr.pkt_hdr = NULL; \
        _n = m_defrag_offset(_m, max_linkhdr, M_NOWAIT); \
        if (_n == NULL) { \
            atomic_add_32(&ipstat.ips_toosmall, 1); \
            m_freem(_m); \
            (_m) = NULL; \
            _action; \
        } else { \
            VERIFY(_n != (_m)); \
            (_m) = _n; \
        } \
    } \
} while (0)
#endif /* !__i386__ && !__x86_64__ */
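/*
 * The _action argument is the statement to execute when the re-allocation
 * fails and the mbuf has already been freed; callers in this file pass
 * "goto bad", "return" or "ipf_unref()", e.g.:
 *
 *	IP_HDR_ALIGNMENT_FIXUP(m, inifp, goto bad);
 *
 * On success _m may point to a different (re-allocated) mbuf afterwards.
 */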

/*
 * GRE input handler function, settable via ip_gre_register_input() for PPTP.
 */
static gre_input_func_t gre_input_func;

static void
ip_init_delayed(void)
{
    struct ifreq ifr;
    int error;
    struct sockaddr_in *sin;

    bzero(&ifr, sizeof(ifr));
    strlcpy(ifr.ifr_name, "lo0", sizeof(ifr.ifr_name));
    sin = (struct sockaddr_in *)(void *)&ifr.ifr_addr;
    sin->sin_len = sizeof(struct sockaddr_in);
    sin->sin_family = AF_INET;
    sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
    error = in_control(NULL, SIOCSIFADDR, (caddr_t)&ifr, lo_ifp, kernproc);
    if (error) {
        printf("%s: failed to initialise lo0's address, error=%d\n",
            __func__, error);
    }
}

/*
 * IP initialization: fill in IP protocol switch table.
 * All protocols not implemented in kernel go to raw IP protocol handler.
 */
void
ip_init(struct protosw *pp, struct domain *dp)
{
    static int ip_initialized = 0;
    struct protosw *pr;
    struct timeval tv;
    int i;

    domain_proto_mtx_lock_assert_held();
    VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED);

    /* ipq_alloc() uses mbufs for IP fragment queue structures */
    _CASSERT(sizeof(struct ipq) <= _MLEN);

    /*
     * Some ioctls (e.g. SIOCAIFADDR) use ifaliasreq struct, which is
     * interchangeable with in_aliasreq; they must have the same size.
     */
    _CASSERT(sizeof(struct ifaliasreq) == sizeof(struct in_aliasreq));

    if (ip_initialized) {
        return;
    }
    ip_initialized = 1;

    in_ifaddr_init();

    in_ifaddr_rwlock_grp_attr = lck_grp_attr_alloc_init();
    in_ifaddr_rwlock_grp = lck_grp_alloc_init("in_ifaddr_rwlock",
        in_ifaddr_rwlock_grp_attr);
    in_ifaddr_rwlock_attr = lck_attr_alloc_init();
    lck_rw_init(in_ifaddr_rwlock, in_ifaddr_rwlock_grp,
        in_ifaddr_rwlock_attr);

    TAILQ_INIT(&in_ifaddrhead);
    in_ifaddrhashtbl_init();

    ip_moptions_init();

    pr = pffindproto_locked(PF_INET, IPPROTO_RAW, SOCK_RAW);
    if (pr == NULL) {
        panic("%s: Unable to find [PF_INET,IPPROTO_RAW,SOCK_RAW]\n",
            __func__);
        /* NOTREACHED */
    }

    /* Initialize the entire ip_protox[] array to IPPROTO_RAW. */
    for (i = 0; i < IPPROTO_MAX; i++) {
        ip_protox[i] = pr;
    }
    /*
     * Cycle through IP protocols and put them into the appropriate place
     * in ip_protox[], skipping protocols IPPROTO_{IP,RAW}.
     */
    VERIFY(dp == inetdomain && dp->dom_family == PF_INET);
    TAILQ_FOREACH(pr, &dp->dom_protosw, pr_entry) {
        VERIFY(pr->pr_domain == dp);
        if (pr->pr_protocol != 0 && pr->pr_protocol != IPPROTO_RAW) {
            /* Be careful to only index valid IP protocols. */
            if (pr->pr_protocol < IPPROTO_MAX) {
                ip_protox[pr->pr_protocol] = pr;
            }
        }
    }

    /* IP fragment reassembly queue lock */
    ipqlock_grp_attr = lck_grp_attr_alloc_init();
    ipqlock_grp = lck_grp_alloc_init("ipqlock", ipqlock_grp_attr);
    ipqlock_attr = lck_attr_alloc_init();
    lck_mtx_init(&ipqlock, ipqlock_grp, ipqlock_attr);

    lck_mtx_lock(&ipqlock);
    /* Initialize IP reassembly queue. */
    for (i = 0; i < IPREASS_NHASH; i++) {
        TAILQ_INIT(&ipq[i]);
    }

    maxnipq = nmbclusters / 32;
    maxfragsperpacket = 128; /* enough for 64k in 512 byte fragments */
    ipq_updateparams();
    lck_mtx_unlock(&ipqlock);

    getmicrotime(&tv);
    ip_id = RandomULong() ^ tv.tv_usec;
    ip_initid();

    ipf_init();

#if IPSEC
    sadb_stat_mutex_grp_attr = lck_grp_attr_alloc_init();
    sadb_stat_mutex_grp = lck_grp_alloc_init("sadb_stat",
        sadb_stat_mutex_grp_attr);
    sadb_stat_mutex_attr = lck_attr_alloc_init();
    lck_mtx_init(sadb_stat_mutex, sadb_stat_mutex_grp,
        sadb_stat_mutex_attr);

#endif
    arp_init();
    net_init_add(ip_init_delayed);
}

/*
 * Initialize IPv4 source address hash table.
 */
static void
in_ifaddrhashtbl_init(void)
{
    int i, k, p;

    if (in_ifaddrhashtbl != NULL) {
        return;
    }

    PE_parse_boot_argn("inaddr_nhash", &inaddr_nhash,
        sizeof(inaddr_nhash));
    if (inaddr_nhash == 0) {
        inaddr_nhash = INADDR_NHASH;
    }

    MALLOC(in_ifaddrhashtbl, struct in_ifaddrhashhead *,
        inaddr_nhash * sizeof(*in_ifaddrhashtbl),
        M_IFADDR, M_WAITOK | M_ZERO);
    if (in_ifaddrhashtbl == NULL) {
        panic("in_ifaddrhashtbl_init allocation failed");
    }

    /*
     * Generate the next largest prime greater than inaddr_nhash.
     */
    k = (inaddr_nhash % 2 == 0) ? inaddr_nhash + 1 : inaddr_nhash + 2;
    for (;;) {
        p = 1;
        for (i = 3; i * i <= k; i += 2) {
            if (k % i == 0) {
                p = 0;
            }
        }
        if (p == 1) {
            break;
        }
        k += 2;
    }
    inaddr_hashp = k;
}
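/*
 * With the default table size of INADDR_NHASH (61) and no "inaddr_nhash"
 * boot-arg, the search above starts at k = 63, rejects 63 and 65, and
 * settles on 67, the first prime greater than 61, so inaddr_hashp is 67.
 */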

u_int32_t
inaddr_hashval(u_int32_t key)
{
    /*
     * The hash index is the computed prime times the key modulo
     * the hash size, as documented in "Introduction to Algorithms"
     * (Cormen, Leiserson, Rivest).
     */
    if (inaddr_nhash > 1) {
        return (key * inaddr_hashp) % inaddr_nhash;
    } else {
        return 0;
    }
}
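/*
 * For illustration, with the defaults computed above (inaddr_nhash = 61,
 * inaddr_hashp = 67) a key of 5 hashes to (5 * 67) % 61 = 30; the
 * multiplication is done in 32-bit arithmetic, so larger keys simply wrap
 * before the modulo is taken.
 */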

void
ip_proto_dispatch_in_wrapper(struct mbuf *m, int hlen, u_int8_t proto)
{
    ip_proto_dispatch_in(m, hlen, proto, 0);
}

__private_extern__ void
ip_proto_dispatch_in(struct mbuf *m, int hlen, u_int8_t proto,
    ipfilter_t inject_ipfref)
{
    struct ipfilter *filter;
    int seen = (inject_ipfref == NULL);
    int changed_header = 0;
    struct ip *ip;
    void (*pr_input)(struct mbuf *, int len);

    if (!TAILQ_EMPTY(&ipv4_filters)) {
        ipf_ref();
        TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
            if (seen == 0) {
                if ((struct ipfilter *)inject_ipfref == filter) {
                    seen = 1;
                }
            } else if (filter->ipf_filter.ipf_input) {
                errno_t result;

                if (changed_header == 0) {
                    /*
                     * Perform IP header alignment fixup,
                     * if needed, before passing packet
                     * into filter(s).
                     */
                    IP_HDR_ALIGNMENT_FIXUP(m,
                        m->m_pkthdr.rcvif, ipf_unref());

                    /* ipf_unref() already called */
                    if (m == NULL) {
                        return;
                    }

                    changed_header = 1;
                    ip = mtod(m, struct ip *);
                    ip->ip_len = htons(ip->ip_len + hlen);
                    ip->ip_off = htons(ip->ip_off);
                    ip->ip_sum = 0;
                    ip->ip_sum = ip_cksum_hdr_in(m, hlen);
                }
                result = filter->ipf_filter.ipf_input(
                    filter->ipf_filter.cookie, (mbuf_t *)&m,
                    hlen, proto);
                if (result == EJUSTRETURN) {
                    ipf_unref();
                    return;
                }
                if (result != 0) {
                    ipf_unref();
                    m_freem(m);
                    return;
                }
            }
        }
        ipf_unref();
    }

    /* Perform IP header alignment fixup (post-filters), if needed */
    IP_HDR_ALIGNMENT_FIXUP(m, m->m_pkthdr.rcvif, return );

    /*
     * If there isn't a specific lock for the protocol
     * we're about to call, use the generic lock for AF_INET.
     * Otherwise let the protocol deal with its own locking.
     */
    ip = mtod(m, struct ip *);

    if (changed_header) {
        ip->ip_len = ntohs(ip->ip_len) - hlen;
        ip->ip_off = ntohs(ip->ip_off);
    }

    if ((pr_input = ip_protox[ip->ip_p]->pr_input) == NULL) {
        m_freem(m);
    } else if (!(ip_protox[ip->ip_p]->pr_flags & PR_PROTOLOCK)) {
        lck_mtx_lock(inet_domain_mutex);
        pr_input(m, hlen);
        lck_mtx_unlock(inet_domain_mutex);
    } else {
        pr_input(m, hlen);
    }
}

struct pktchain_elm {
    struct mbuf *pkte_head;
    struct mbuf *pkte_tail;
    struct in_addr pkte_saddr;
    struct in_addr pkte_daddr;
    uint16_t pkte_npkts;
    uint16_t pkte_proto;
    uint32_t pkte_nbytes;
};

typedef struct pktchain_elm pktchain_elm_t;

/* Store up to PKTTBL_SZ unique flows on the stack */
#define PKTTBL_SZ 7

static struct mbuf *
ip_chain_insert(struct mbuf *packet, pktchain_elm_t *tbl)
{
    struct ip* ip;
    int pkttbl_idx = 0;

    ip = mtod(packet, struct ip*);

    /* reusing the hash function from inaddr_hashval */
    pkttbl_idx = inaddr_hashval(ntohs(ip->ip_src.s_addr)) % PKTTBL_SZ;
    if (tbl[pkttbl_idx].pkte_head == NULL) {
        tbl[pkttbl_idx].pkte_head = packet;
        tbl[pkttbl_idx].pkte_saddr.s_addr = ip->ip_src.s_addr;
        tbl[pkttbl_idx].pkte_daddr.s_addr = ip->ip_dst.s_addr;
        tbl[pkttbl_idx].pkte_proto = ip->ip_p;
    } else {
        if ((ip->ip_dst.s_addr == tbl[pkttbl_idx].pkte_daddr.s_addr) &&
            (ip->ip_src.s_addr == tbl[pkttbl_idx].pkte_saddr.s_addr) &&
            (ip->ip_p == tbl[pkttbl_idx].pkte_proto)) {
        } else {
            return packet;
        }
    }
    if (tbl[pkttbl_idx].pkte_tail != NULL) {
        mbuf_setnextpkt(tbl[pkttbl_idx].pkte_tail, packet);
    }

    tbl[pkttbl_idx].pkte_tail = packet;
    tbl[pkttbl_idx].pkte_npkts += 1;
    tbl[pkttbl_idx].pkte_nbytes += packet->m_pkthdr.len;
    return NULL;
}
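/*
 * ip_chain_insert() returns NULL when the packet has been appended to the
 * per-flow chain selected by the hash above, and returns the packet itself
 * when that slot is already owned by a different (saddr, daddr, proto)
 * flow, so the caller can handle it outside the chained path.
 */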

/* args is a dummy variable here for backward compatibility */
static void
ip_input_second_pass_loop_tbl(pktchain_elm_t *tbl, struct ip_fw_in_args *args)
{
    int i = 0;

    for (i = 0; i < PKTTBL_SZ; i++) {
        if (tbl[i].pkte_head != NULL) {
            struct mbuf *m = tbl[i].pkte_head;
            ip_input_second_pass(m, m->m_pkthdr.rcvif, 0,
                tbl[i].pkte_npkts, tbl[i].pkte_nbytes, args, 0);

            if (tbl[i].pkte_npkts > 2) {
                ipstat.ips_rxc_chainsz_gt2++;
            }
            if (tbl[i].pkte_npkts > 4) {
                ipstat.ips_rxc_chainsz_gt4++;
            }
#if (DEBUG || DEVELOPMENT)
            if (ip_input_measure) {
                net_perf_histogram(&net_perf, tbl[i].pkte_npkts);
            }
#endif /* (DEBUG || DEVELOPMENT) */
            tbl[i].pkte_head = tbl[i].pkte_tail = NULL;
            tbl[i].pkte_npkts = 0;
            tbl[i].pkte_nbytes = 0;
            /* no need to initialize address and protocol in tbl */
        }
    }
}

static void
ip_input_cpout_args(struct ip_fw_in_args *args, struct ip_fw_args *args1,
    boolean_t *done_init)
{
    if (*done_init == FALSE) {
        bzero(args1, sizeof(struct ip_fw_args));
        *done_init = TRUE;
    }
    args1->fwa_next_hop = args->fwai_next_hop;
    args1->fwa_ipfw_rule = args->fwai_ipfw_rule;
    args1->fwa_pf_rule = args->fwai_pf_rule;
    args1->fwa_divert_rule = args->fwai_divert_rule;
}

static void
ip_input_cpin_args(struct ip_fw_args *args1, struct ip_fw_in_args *args)
{
    args->fwai_next_hop = args1->fwa_next_hop;
    args->fwai_ipfw_rule = args1->fwa_ipfw_rule;
    args->fwai_pf_rule = args1->fwa_pf_rule;
    args->fwai_divert_rule = args1->fwa_divert_rule;
}

typedef enum {
    IPINPUT_DOCHAIN = 0,
    IPINPUT_DONTCHAIN,
    IPINPUT_FREED,
    IPINPUT_DONE
} ipinput_chain_ret_t;

static void
ip_input_update_nstat(struct ifnet *ifp, struct in_addr src_ip,
    u_int32_t packets, u_int32_t bytes)
{
    if (nstat_collect) {
        struct rtentry *rt = ifnet_cached_rtlookup_inet(ifp,
            src_ip);
        if (rt != NULL) {
            nstat_route_rx(rt, packets, bytes, 0);
            rtfree(rt);
        }
    }
}

static void
ip_input_dispatch_chain(struct mbuf *m)
{
    struct mbuf *tmp_mbuf = m;
    struct mbuf *nxt_mbuf = NULL;
    struct ip *ip = NULL;
    unsigned int hlen;

    ip = mtod(tmp_mbuf, struct ip *);
    hlen = IP_VHL_HL(ip->ip_vhl) << 2;
    while (tmp_mbuf) {
        nxt_mbuf = mbuf_nextpkt(tmp_mbuf);
        mbuf_setnextpkt(tmp_mbuf, NULL);

        if ((sw_lro) && (ip->ip_p == IPPROTO_TCP)) {
            tmp_mbuf = tcp_lro(tmp_mbuf, hlen);
        }
        if (tmp_mbuf) {
            ip_proto_dispatch_in(tmp_mbuf, hlen, ip->ip_p, 0);
        }
        tmp_mbuf = nxt_mbuf;
        if (tmp_mbuf) {
            ip = mtod(tmp_mbuf, struct ip *);
            /* first mbuf of chain already has adjusted ip_len */
            hlen = IP_VHL_HL(ip->ip_vhl) << 2;
            ip->ip_len -= hlen;
        }
    }
}

static void
ip_input_setdst_chain(struct mbuf *m, uint32_t ifindex, struct in_ifaddr *ia)
{
    struct mbuf *tmp_mbuf = m;

    while (tmp_mbuf) {
        ip_setdstifaddr_info(tmp_mbuf, ifindex, ia);
        tmp_mbuf = mbuf_nextpkt(tmp_mbuf);
    }
}

static void
ip_input_adjust(struct mbuf *m, struct ip *ip, struct ifnet *inifp)
{
    boolean_t adjust = TRUE;

    ASSERT(m_pktlen(m) > ip->ip_len);

    /*
     * Invalidate hardware checksum info if ip_adj_clear_hwcksum
     * is set; useful to handle buggy drivers. Note that this
     * should not be enabled by default, as we may get here due
     * to link-layer padding.
     */
    if (ip_adj_clear_hwcksum &&
        (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) &&
        !(inifp->if_flags & IFF_LOOPBACK) &&
        !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
        m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
        m->m_pkthdr.csum_data = 0;
        ipstat.ips_adj_hwcsum_clr++;
    }

    /*
     * If partial checksum information is available, subtract
     * out the partial sum of postpended extraneous bytes, and
     * update the checksum metadata accordingly. By doing it
     * here, the upper layer transport only needs to adjust any
     * prepended extraneous bytes (else it will do both.)
     */
    if (ip_adj_partial_sum &&
        (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
        (CSUM_DATA_VALID | CSUM_PARTIAL)) {
        m->m_pkthdr.csum_rx_val = m_adj_sum16(m,
            m->m_pkthdr.csum_rx_start, m->m_pkthdr.csum_rx_start,
            (ip->ip_len - m->m_pkthdr.csum_rx_start),
            m->m_pkthdr.csum_rx_val);
    } else if ((m->m_pkthdr.csum_flags &
        (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
        (CSUM_DATA_VALID | CSUM_PARTIAL)) {
        /*
         * If packet has partial checksum info and we decided not
         * to subtract the partial sum of postpended extraneous
         * bytes here (not the default case), leave that work to
         * be handled by the other layers. For now, only TCP, UDP
         * layers are capable of dealing with this. For all other
         * protocols (including fragments), trim and ditch the
         * partial sum as those layers might not implement partial
         * checksumming (or adjustment) at all.
         */
        if ((ip->ip_off & (IP_MF | IP_OFFMASK)) == 0 &&
            (ip->ip_p == IPPROTO_TCP || ip->ip_p == IPPROTO_UDP)) {
            adjust = FALSE;
        } else {
            m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
            m->m_pkthdr.csum_data = 0;
            ipstat.ips_adj_hwcsum_clr++;
        }
    }

    if (adjust) {
        ipstat.ips_adj++;
        if (m->m_len == m->m_pkthdr.len) {
            m->m_len = ip->ip_len;
            m->m_pkthdr.len = ip->ip_len;
        } else {
            m_adj(m, ip->ip_len - m->m_pkthdr.len);
        }
    }
}

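/*
 * A common case for the trimming above: a minimum-size Ethernet frame
 * carrying, say, a 28-byte IP datagram arrives with its payload padded out
 * to 46 bytes by the sender, so m_pkthdr.len (46) exceeds ip_len (28) and
 * the 18 bytes of link-layer padding are removed here.
 */
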
/*
 * First pass does all essential packet validation and places on a per flow
 * queue for doing operations that have same outcome for all packets of a flow.
 * div_info is packet divert/tee info
 */
static ipinput_chain_ret_t
ip_input_first_pass(struct mbuf *m, u_int32_t *div_info,
    struct ip_fw_in_args *args, int *ours, struct mbuf **modm)
{
    struct ip *ip;
    struct ifnet *inifp;
    unsigned int hlen;
    int retval = IPINPUT_DOCHAIN;
    int len = 0;
    struct in_addr src_ip;
#if IPFIREWALL
    int i;
#endif
#if IPFIREWALL || DUMMYNET
    struct m_tag *copy;
    struct m_tag *p;
    boolean_t delete = FALSE;
    struct ip_fw_args args1;
    boolean_t init = FALSE;
#endif
    ipfilter_t inject_filter_ref = NULL;

#if !IPFIREWALL
#pragma unused (args)
#endif

#if !IPDIVERT
#pragma unused (div_info)
#pragma unused (ours)
#endif

#if !IPFIREWALL_FORWARD
#pragma unused (ours)
#endif

    /* Check if the mbuf is still valid after interface filter processing */
    MBUF_INPUT_CHECK(m, m->m_pkthdr.rcvif);
    inifp = mbuf_pkthdr_rcvif(m);
    VERIFY(inifp != NULL);

    /* Perform IP header alignment fixup, if needed */
    IP_HDR_ALIGNMENT_FIXUP(m, inifp, goto bad);

    m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;

#if IPFIREWALL || DUMMYNET

    /*
     * Don't bother searching for tag(s) if there's none.
     */
    if (SLIST_EMPTY(&m->m_pkthdr.tags)) {
        goto ipfw_tags_done;
    }

    /* Grab info from mtags prepended to the chain */
    p = m_tag_first(m);
    while (p) {
        if (p->m_tag_id == KERNEL_MODULE_TAG_ID) {
#if DUMMYNET
            if (p->m_tag_type == KERNEL_TAG_TYPE_DUMMYNET) {
                struct dn_pkt_tag *dn_tag;

                dn_tag = (struct dn_pkt_tag *)(p + 1);
                args->fwai_ipfw_rule = dn_tag->dn_ipfw_rule;
                args->fwai_pf_rule = dn_tag->dn_pf_rule;
                delete = TRUE;
            }
#endif

#if IPDIVERT
            if (p->m_tag_type == KERNEL_TAG_TYPE_DIVERT) {
                struct divert_tag *div_tag;

                div_tag = (struct divert_tag *)(p + 1);
                args->fwai_divert_rule = div_tag->cookie;
                delete = TRUE;
            }
#endif

            if (p->m_tag_type == KERNEL_TAG_TYPE_IPFORWARD) {
                struct ip_fwd_tag *ipfwd_tag;

                ipfwd_tag = (struct ip_fwd_tag *)(p + 1);
                args->fwai_next_hop = ipfwd_tag->next_hop;
                delete = TRUE;
            }

            if (delete) {
                copy = p;
                p = m_tag_next(m, p);
                m_tag_delete(m, copy);
            } else {
                p = m_tag_next(m, p);
            }
        } else {
            p = m_tag_next(m, p);
        }
    }

#if DIAGNOSTIC
    if (m == NULL || !(m->m_flags & M_PKTHDR)) {
        panic("ip_input no HDR");
    }
#endif

#if DUMMYNET
    if (args->fwai_ipfw_rule || args->fwai_pf_rule) {
        /* dummynet already filtered us */
        ip = mtod(m, struct ip *);
        hlen = IP_VHL_HL(ip->ip_vhl) << 2;
        inject_filter_ref = ipf_get_inject_filter(m);
#if IPFIREWALL
        if (args->fwai_ipfw_rule) {
            goto iphack;
        }
#endif /* IPFIREWALL */
        if (args->fwai_pf_rule) {
            goto check_with_pf;
        }
    }
#endif /* DUMMYNET */
ipfw_tags_done:
#endif /* IPFIREWALL || DUMMYNET */

    /*
     * No need to process packet twice if we've already seen it.
     */
    if (!SLIST_EMPTY(&m->m_pkthdr.tags)) {
        inject_filter_ref = ipf_get_inject_filter(m);
    }
    if (inject_filter_ref != NULL) {
        ip = mtod(m, struct ip *);
        hlen = IP_VHL_HL(ip->ip_vhl) << 2;

        DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL,
            struct ip *, ip, struct ifnet *, inifp,
            struct ip *, ip, struct ip6_hdr *, NULL);

        ip->ip_len = ntohs(ip->ip_len) - hlen;
        ip->ip_off = ntohs(ip->ip_off);
        ip_proto_dispatch_in(m, hlen, ip->ip_p, inject_filter_ref);
        return IPINPUT_DONE;
    }

    if (m->m_pkthdr.len < sizeof(struct ip)) {
        OSAddAtomic(1, &ipstat.ips_total);
        OSAddAtomic(1, &ipstat.ips_tooshort);
        m_freem(m);
        return IPINPUT_FREED;
    }

    if (m->m_len < sizeof(struct ip) &&
        (m = m_pullup(m, sizeof(struct ip))) == NULL) {
        OSAddAtomic(1, &ipstat.ips_total);
        OSAddAtomic(1, &ipstat.ips_toosmall);
        return IPINPUT_FREED;
    }

    ip = mtod(m, struct ip *);
    *modm = m;

    KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr, ip->ip_src.s_addr,
        ip->ip_p, ip->ip_off, ip->ip_len);

    if (IP_VHL_V(ip->ip_vhl) != IPVERSION) {
        OSAddAtomic(1, &ipstat.ips_total);
        OSAddAtomic(1, &ipstat.ips_badvers);
        KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
        m_freem(m);
        return IPINPUT_FREED;
    }

    hlen = IP_VHL_HL(ip->ip_vhl) << 2;
    if (hlen < sizeof(struct ip)) {
        OSAddAtomic(1, &ipstat.ips_total);
        OSAddAtomic(1, &ipstat.ips_badhlen);
        KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
        m_freem(m);
        return IPINPUT_FREED;
    }

    if (hlen > m->m_len) {
        if ((m = m_pullup(m, hlen)) == NULL) {
            OSAddAtomic(1, &ipstat.ips_total);
            OSAddAtomic(1, &ipstat.ips_badhlen);
            KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
            return IPINPUT_FREED;
        }
        ip = mtod(m, struct ip *);
        *modm = m;
    }

    /* 127/8 must not appear on wire - RFC1122 */
    if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
        (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
        /*
         * Allow for the following exceptions:
         *
         * 1. If the packet was sent to loopback (i.e. rcvif
         *    would have been set earlier at output time.)
         *
         * 2. If the packet was sent out on loopback from a local
         *    source address which belongs to a non-loopback
         *    interface (i.e. rcvif may not necessarily be a
         *    loopback interface, hence the test for PKTF_LOOP.)
         *    Unlike IPv6, there is no interface scope ID, and
         *    therefore we don't care so much about PKTF_IFINFO.
         */
        if (!(inifp->if_flags & IFF_LOOPBACK) &&
            !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
            OSAddAtomic(1, &ipstat.ips_total);
            OSAddAtomic(1, &ipstat.ips_badaddr);
            KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
            m_freem(m);
            return IPINPUT_FREED;
        }
    }

    /* IPv4 Link-Local Addresses as defined in RFC3927 */
    if ((IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) ||
        IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)))) {
        ip_linklocal_stat.iplls_in_total++;
        if (ip->ip_ttl != MAXTTL) {
            OSAddAtomic(1, &ip_linklocal_stat.iplls_in_badttl);
            /* Silently drop link local traffic with bad TTL */
            if (!ip_linklocal_in_allowbadttl) {
                OSAddAtomic(1, &ipstat.ips_total);
                KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
                m_freem(m);
                return IPINPUT_FREED;
            }
        }
    }

    if (ip_cksum(m, hlen)) {
        OSAddAtomic(1, &ipstat.ips_total);
        KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
        m_freem(m);
        return IPINPUT_FREED;
    }

    DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL,
        struct ip *, ip, struct ifnet *, inifp,
        struct ip *, ip, struct ip6_hdr *, NULL);

    /*
     * Convert fields to host representation.
     */
#if BYTE_ORDER != BIG_ENDIAN
    NTOHS(ip->ip_len);
#endif

    if (ip->ip_len < hlen) {
        OSAddAtomic(1, &ipstat.ips_total);
        OSAddAtomic(1, &ipstat.ips_badlen);
        KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
        m_freem(m);
        return IPINPUT_FREED;
    }

#if BYTE_ORDER != BIG_ENDIAN
    NTOHS(ip->ip_off);
#endif

    /*
     * Check that the amount of data in the buffers
     * is at least as much as the IP header would have us expect.
     * Trim mbufs if longer than we expect.
     * Drop packet if shorter than we expect.
     */
    if (m->m_pkthdr.len < ip->ip_len) {
        OSAddAtomic(1, &ipstat.ips_total);
        OSAddAtomic(1, &ipstat.ips_tooshort);
        KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
        m_freem(m);
        return IPINPUT_FREED;
    }

    if (m->m_pkthdr.len > ip->ip_len) {
        ip_input_adjust(m, ip, inifp);
    }

    /* for netstat route statistics */
    src_ip = ip->ip_src;
    len = m->m_pkthdr.len;

#if DUMMYNET
check_with_pf:
#endif
#if PF
    /* Invoke inbound packet filter */
    if (PF_IS_ENABLED) {
        int error;
        ip_input_cpout_args(args, &args1, &init);
        ip = mtod(m, struct ip *);
        src_ip = ip->ip_src;

#if DUMMYNET
        error = pf_af_hook(inifp, NULL, &m, AF_INET, TRUE, &args1);
#else
        error = pf_af_hook(inifp, NULL, &m, AF_INET, TRUE, NULL);
#endif /* DUMMYNET */
        if (error != 0 || m == NULL) {
            if (m != NULL) {
                panic("%s: unexpected packet %p\n",
                    __func__, m);
                /* NOTREACHED */
            }
            /* Already freed by callee */
            ip_input_update_nstat(inifp, src_ip, 1, len);
            KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
            OSAddAtomic(1, &ipstat.ips_total);
            return IPINPUT_FREED;
        }
        ip = mtod(m, struct ip *);
        hlen = IP_VHL_HL(ip->ip_vhl) << 2;
        *modm = m;
        ip_input_cpin_args(&args1, args);
    }
#endif /* PF */

#if IPSEC
    if (ipsec_bypass == 0 && ipsec_gethist(m, NULL)) {
        retval = IPINPUT_DONTCHAIN; /* XXX scope for chaining here? */
        goto pass;
    }
#endif

#if IPFIREWALL
#if DUMMYNET
iphack:
#endif /* DUMMYNET */
    /*
     * Check if we want to allow this packet to be processed.
     * Consider it to be bad if not.
     */
    if (fw_enable && IPFW_LOADED) {
#if IPFIREWALL_FORWARD
        /*
         * If we've been forwarded from the output side, then
         * skip the firewall a second time
         */
        if (args->fwai_next_hop) {
            *ours = 1;
            return IPINPUT_DONTCHAIN;
        }
#endif /* IPFIREWALL_FORWARD */
        ip_input_cpout_args(args, &args1, &init);
        args1.fwa_m = m;

        i = ip_fw_chk_ptr(&args1);
        m = args1.fwa_m;

        if ((i & IP_FW_PORT_DENY_FLAG) || m == NULL) { /* drop */
            if (m) {
                m_freem(m);
            }
            ip_input_update_nstat(inifp, src_ip, 1, len);
            KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
            OSAddAtomic(1, &ipstat.ips_total);
            return IPINPUT_FREED;
        }
        ip = mtod(m, struct ip *); /* just in case m changed */
        *modm = m;
        ip_input_cpin_args(&args1, args);

        if (i == 0 && args->fwai_next_hop == NULL) { /* common case */
            goto pass;
        }
#if DUMMYNET
        if (DUMMYNET_LOADED && (i & IP_FW_PORT_DYNT_FLAG) != 0) {
            /* Send packet to the appropriate pipe */
            ip_dn_io_ptr(m, i & 0xffff, DN_TO_IP_IN, &args1,
                DN_CLIENT_IPFW);
            ip_input_update_nstat(inifp, src_ip, 1, len);
            KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
            OSAddAtomic(1, &ipstat.ips_total);
            return IPINPUT_FREED;
        }
#endif /* DUMMYNET */
#if IPDIVERT
        if (i != 0 && (i & IP_FW_PORT_DYNT_FLAG) == 0) {
            /* Divert or tee packet */
            *div_info = i;
            *ours = 1;
            return IPINPUT_DONTCHAIN;
        }
#endif
#if IPFIREWALL_FORWARD
        if (i == 0 && args->fwai_next_hop != NULL) {
            retval = IPINPUT_DONTCHAIN;
            goto pass;
        }
#endif
        /*
         * if we get here, the packet must be dropped
         */
        ip_input_update_nstat(inifp, src_ip, 1, len);
        KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
        m_freem(m);
        OSAddAtomic(1, &ipstat.ips_total);
        return IPINPUT_FREED;
    }
#endif /* IPFIREWALL */
#if IPSEC | IPFIREWALL
pass:
#endif
    /*
     * Process options and, if not destined for us,
     * ship it on. ip_dooptions returns 1 when an
     * error was detected (causing an icmp message
     * to be sent and the original packet to be freed).
     */
    ip_nhops = 0; /* for source routed packets */
#if IPFIREWALL
    if (hlen > sizeof(struct ip) &&
        ip_dooptions(m, 0, args->fwai_next_hop)) {
#else /* !IPFIREWALL */
    if (hlen > sizeof(struct ip) && ip_dooptions(m, 0, NULL)) {
#endif /* !IPFIREWALL */
        ip_input_update_nstat(inifp, src_ip, 1, len);
        KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
        OSAddAtomic(1, &ipstat.ips_total);
        return IPINPUT_FREED;
    }

    /*
     * Don't chain fragmented packets as the process of determining
     * if it is our fragment or someone else's plus the complexity of
     * divert and fw args makes it harder to do chaining.
     */
    if (ip->ip_off & ~(IP_DF | IP_RF)) {
        return IPINPUT_DONTCHAIN;
    }

    /* Allow DHCP/BootP responses through */
    if ((inifp->if_eflags & IFEF_AUTOCONFIGURING) &&
        hlen == sizeof(struct ip) && ip->ip_p == IPPROTO_UDP) {
        struct udpiphdr *ui;

        if (m->m_len < sizeof(struct udpiphdr) &&
            (m = m_pullup(m, sizeof(struct udpiphdr))) == NULL) {
            OSAddAtomic(1, &udpstat.udps_hdrops);
            KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
            OSAddAtomic(1, &ipstat.ips_total);
            return IPINPUT_FREED;
        }
        *modm = m;
        ui = mtod(m, struct udpiphdr *);
        if (ntohs(ui->ui_dport) == IPPORT_BOOTPC) {
            ip_setdstifaddr_info(m, inifp->if_index, NULL);
            return IPINPUT_DONTCHAIN;
        }
    }

    /* Avoid chaining raw sockets as ipsec checks occur later for them */
    if (ip_protox[ip->ip_p]->pr_flags & PR_LASTHDR) {
        return IPINPUT_DONTCHAIN;
    }

    return retval;
#if !defined(__i386__) && !defined(__x86_64__)
bad:
    m_freem(m);
    return IPINPUT_FREED;
#endif
}

static void
ip_input_second_pass(struct mbuf *m, struct ifnet *inifp, u_int32_t div_info,
    int npkts_in_chain, int bytes_in_chain, struct ip_fw_in_args *args, int ours)
{
    unsigned int checkif;
    struct mbuf *tmp_mbuf = NULL;
    struct in_ifaddr *ia = NULL;
    struct in_addr pkt_dst;
    unsigned int hlen;

#if !IPFIREWALL
#pragma unused (args)
#endif

#if !IPDIVERT
#pragma unused (div_info)
#endif

    struct ip *ip = mtod(m, struct ip *);
    hlen = IP_VHL_HL(ip->ip_vhl) << 2;

    OSAddAtomic(npkts_in_chain, &ipstat.ips_total);

    /*
     * Naively assume we can attribute inbound data to the route we would
     * use to send to this destination. Asymmetric routing breaks this
     * assumption, but it still allows us to account for traffic from
     * a remote node in the routing table.
     * This has a very significant performance impact, so we bypass it
     * if nstat_collect is disabled. We may also bypass if the
     * protocol is tcp in the future because tcp will have a route that
     * we can use to attribute the data to. That does mean we would not
     * account for forwarded tcp traffic.
     */
    ip_input_update_nstat(inifp, ip->ip_src, npkts_in_chain,
        bytes_in_chain);

    if (ours) {
        goto ours;
    }

    /*
     * Check our list of addresses, to see if the packet is for us.
     * If we don't have any addresses, assume any unicast packet
     * we receive might be for us (and let the upper layers deal
     * with it).
     */
    tmp_mbuf = m;
    if (TAILQ_EMPTY(&in_ifaddrhead)) {
        while (tmp_mbuf) {
            if (!(tmp_mbuf->m_flags & (M_MCAST | M_BCAST))) {
                ip_setdstifaddr_info(tmp_mbuf, inifp->if_index,
                    NULL);
            }
            tmp_mbuf = mbuf_nextpkt(tmp_mbuf);
        }
        goto ours;
    }
    /*
     * Cache the destination address of the packet; this may be
     * changed by use of 'ipfw fwd'.
     */
#if IPFIREWALL
    pkt_dst = args->fwai_next_hop == NULL ?
        ip->ip_dst : args->fwai_next_hop->sin_addr;
#else /* !IPFIREWALL */
    pkt_dst = ip->ip_dst;
#endif /* !IPFIREWALL */

    /*
     * Enable a consistency check between the destination address
     * and the arrival interface for a unicast packet (the RFC 1122
     * strong ES model) if IP forwarding is disabled and the packet
     * is not locally generated and the packet is not subject to
     * 'ipfw fwd'.
     *
     * XXX - Checking also should be disabled if the destination
     * address is ipnat'ed to a different interface.
     *
     * XXX - Checking is incompatible with IP aliases added
     * to the loopback interface instead of the interface where
     * the packets are received.
     */
    checkif = ip_checkinterface && (ipforwarding == 0) &&
        !(inifp->if_flags & IFF_LOOPBACK) &&
        !(m->m_pkthdr.pkt_flags & PKTF_LOOP)
#if IPFIREWALL
        && (args->fwai_next_hop == NULL);
#else /* !IPFIREWALL */
    ;
#endif /* !IPFIREWALL */

    /*
     * Check for exact addresses in the hash bucket.
     */
    lck_rw_lock_shared(in_ifaddr_rwlock);
    TAILQ_FOREACH(ia, INADDR_HASH(pkt_dst.s_addr), ia_hash) {
        /*
         * If the address matches, verify that the packet
         * arrived via the correct interface if checking is
         * enabled.
         */
        if (IA_SIN(ia)->sin_addr.s_addr == pkt_dst.s_addr &&
            (!checkif || ia->ia_ifp == inifp)) {
            ip_input_setdst_chain(m, 0, ia);
            lck_rw_done(in_ifaddr_rwlock);
            goto ours;
        }
    }
    lck_rw_done(in_ifaddr_rwlock);

    /*
     * Check for broadcast addresses.
     *
     * Only accept broadcast packets that arrive via the matching
     * interface. Reception of forwarded directed broadcasts would be
     * handled via ip_forward() and ether_frameout() with the loopback
     * into the stack for SIMPLEX interfaces handled by ether_frameout().
     */
    if (inifp->if_flags & IFF_BROADCAST) {
        struct ifaddr *ifa;

        ifnet_lock_shared(inifp);
        TAILQ_FOREACH(ifa, &inifp->if_addrhead, ifa_link) {
            if (ifa->ifa_addr->sa_family != AF_INET) {
                continue;
            }
            ia = ifatoia(ifa);
            if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr ==
                pkt_dst.s_addr || ia->ia_netbroadcast.s_addr ==
                pkt_dst.s_addr) {
                ip_input_setdst_chain(m, 0, ia);
                ifnet_lock_done(inifp);
                goto ours;
            }
        }
        ifnet_lock_done(inifp);
    }

    if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
        struct in_multi *inm;
        /*
         * See if we belong to the destination multicast group on the
         * arrival interface.
         */
        in_multihead_lock_shared();
        IN_LOOKUP_MULTI(&ip->ip_dst, inifp, inm);
        in_multihead_lock_done();
        if (inm == NULL) {
            OSAddAtomic(npkts_in_chain, &ipstat.ips_notmember);
            m_freem_list(m);
            KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
            return;
        }
        ip_input_setdst_chain(m, inifp->if_index, NULL);
        INM_REMREF(inm);
        goto ours;
    }

    if (ip->ip_dst.s_addr == (u_int32_t)INADDR_BROADCAST ||
        ip->ip_dst.s_addr == INADDR_ANY) {
        ip_input_setdst_chain(m, inifp->if_index, NULL);
        goto ours;
    }

    if (ip->ip_p == IPPROTO_UDP) {
        struct udpiphdr *ui;
        ui = mtod(m, struct udpiphdr *);
        if (ntohs(ui->ui_dport) == IPPORT_BOOTPC) {
            goto ours;
        }
    }

    tmp_mbuf = m;
    struct mbuf *nxt_mbuf = NULL;
    while (tmp_mbuf) {
        nxt_mbuf = mbuf_nextpkt(tmp_mbuf);
        /*
         * Not for us; forward if possible and desirable.
         */
        mbuf_setnextpkt(tmp_mbuf, NULL);
        if (ipforwarding == 0) {
            OSAddAtomic(1, &ipstat.ips_cantforward);
            m_freem(tmp_mbuf);
        } else {
#if IPFIREWALL
            ip_forward(tmp_mbuf, 0, args->fwai_next_hop);
#else
            ip_forward(tmp_mbuf, 0, NULL);
#endif
        }
        tmp_mbuf = nxt_mbuf;
    }
    KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
    return;
ours:
    /*
     * If offset or IP_MF are set, must reassemble.
     */
    if (ip->ip_off & ~(IP_DF | IP_RF)) {
        VERIFY(npkts_in_chain == 1);
        /*
         * ip_reass() will return a different mbuf, and update
         * the divert info in div_info and args->fwai_divert_rule.
         */
#if IPDIVERT
        m = ip_reass(m, (u_int16_t *)&div_info, &args->fwai_divert_rule);
#else
        m = ip_reass(m);
#endif
        if (m == NULL) {
            return;
        }
1627 ip = mtod(m, struct ip *);
1628 /* Get the header length of the reassembled packet */
1629 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1630#if IPDIVERT
1631 /* Restore original checksum before diverting packet */
1632 if (div_info != 0) {
1633 VERIFY(npkts_in_chain == 1);
1634#if BYTE_ORDER != BIG_ENDIAN
1635 HTONS(ip->ip_len);
1636 HTONS(ip->ip_off);
1637#endif
1638 ip->ip_sum = 0;
1639 ip->ip_sum = ip_cksum_hdr_in(m, hlen);
1640#if BYTE_ORDER != BIG_ENDIAN
1641 NTOHS(ip->ip_off);
1642 NTOHS(ip->ip_len);
1643#endif
1644 }
1645#endif
1646 }
1647
1648 /*
1649 * Further protocols expect the packet length to be w/o the
1650 * IP header.
1651 */
1652 ip->ip_len -= hlen;
1653
1654#if IPDIVERT
1655 /*
1656 * Divert or tee packet to the divert protocol if required.
1657 *
1658 * If div_info is zero then cookie should be too, so we shouldn't
1659 * need to clear them here. Assume divert_packet() does so also.
1660 */
1661 if (div_info != 0) {
1662 struct mbuf *clone = NULL;
1663 VERIFY(npkts_in_chain == 1);
1664
1665 /* Clone packet if we're doing a 'tee' */
0a7de745 1666 if (div_info & IP_FW_PORT_TEE_FLAG) {
3e170ce0 1667 clone = m_dup(m, M_DONTWAIT);
0a7de745 1668 }
3e170ce0
A
1669
1670 /* Restore packet header fields to original values */
1671 ip->ip_len += hlen;
1672
1673#if BYTE_ORDER != BIG_ENDIAN
1674 HTONS(ip->ip_len);
1675 HTONS(ip->ip_off);
1676#endif
1677 /* Deliver packet to divert input routine */
1678 OSAddAtomic(1, &ipstat.ips_delivered);
1679 divert_packet(m, 1, div_info & 0xffff, args->fwai_divert_rule);
1680
1681 /* If 'tee', continue with original packet */
1682 if (clone == NULL) {
1683 return;
1684 }
1685 m = clone;
1686 ip = mtod(m, struct ip *);
1687 }
1688#endif
1689
1690#if IPSEC
39236c6e 1691 /*
 1692	 * Enforce IPsec policy checking if we are seeing the last header.
 1693	 * Note that we do not visit this for protocols with PCB-layer
 1694	 * code, such as UDP, TCP and raw IP.
39236c6e 1695 */
1696 if (ipsec_bypass == 0 && (ip_protox[ip->ip_p]->pr_flags & PR_LASTHDR)) {
1697 VERIFY(npkts_in_chain == 1);
1698 if (ipsec4_in_reject(m, NULL)) {
1699 IPSEC_STAT_INCREMENT(ipsecstat.in_polvio);
1700 goto bad;
91447636 1701 }
39236c6e 1702 }
3e170ce0 1703#endif /* IPSEC */
91447636 1704
1705 /*
1706 * Switch out to protocol's input routine.
1707 */
1708 OSAddAtomic(npkts_in_chain, &ipstat.ips_delivered);
39236c6e 1709
1710#if IPFIREWALL
1711 if (args->fwai_next_hop && ip->ip_p == IPPROTO_TCP) {
1712 /* TCP needs IPFORWARD info if available */
1713 struct m_tag *fwd_tag;
1714 struct ip_fwd_tag *ipfwd_tag;
39236c6e 1715
1716 VERIFY(npkts_in_chain == 1);
1717 fwd_tag = m_tag_create(KERNEL_MODULE_TAG_ID,
0a7de745 1718 KERNEL_TAG_TYPE_IPFORWARD, sizeof(*ipfwd_tag),
3e170ce0 1719 M_NOWAIT, m);
0a7de745 1720 if (fwd_tag == NULL) {
3e170ce0 1721 goto bad;
0a7de745 1722 }
39236c6e 1723
0a7de745 1724 ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag + 1);
3e170ce0 1725 ipfwd_tag->next_hop = args->fwai_next_hop;
1c79356b 1726
3e170ce0 1727 m_tag_prepend(m, fwd_tag);
b0d623f7 1728
1729 KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr,
1730 ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len);
b0d623f7 1731
1732 /* TCP deals with its own locking */
1733 ip_proto_dispatch_in(m, hlen, ip->ip_p, 0);
1734 } else {
1735 KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr,
1736 ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len);
b0d623f7 1737
3e170ce0 1738 ip_input_dispatch_chain(m);
b0d623f7 1739 }
1740#else /* !IPFIREWALL */
1741 ip_input_dispatch_chain(m);
b0d623f7 1742
1743#endif /* !IPFIREWALL */
1744 KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1745 return;
1746bad:
1747 KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1748 m_freem(m);
1749}
1750
316670eb 1751void
3e170ce0 1752ip_input_process_list(struct mbuf *packet_list)
91447636 1753{
0a7de745 1754 pktchain_elm_t pktchain_tbl[PKTTBL_SZ];
1756 struct mbuf *packet = NULL;
1757 struct mbuf *modm = NULL; /* modified mbuf */
1758 int retval = 0;
1759 u_int32_t div_info = 0;
1760 int ours = 0;
39037602 1761#if (DEBUG || DEVELOPMENT)
3e170ce0 1762 struct timeval start_tv;
39037602 1763#endif /* (DEBUG || DEVELOPMENT) */
0a7de745 1764 int num_pkts = 0;
1765 int chain = 0;
1766 struct ip_fw_in_args args;
1767
1768 if (ip_chaining == 0) {
1769 struct mbuf *m = packet_list;
39037602 1770#if (DEBUG || DEVELOPMENT)
0a7de745 1771 if (ip_input_measure) {
3e170ce0 1772 net_perf_start_time(&net_perf, &start_tv);
0a7de745 1773 }
1774#endif /* (DEBUG || DEVELOPMENT) */
1775
1776 while (m) {
1777 packet_list = mbuf_nextpkt(m);
1778 mbuf_setnextpkt(m, NULL);
1779 ip_input(m);
1780 m = packet_list;
1781 num_pkts++;
1782 }
39037602 1783#if (DEBUG || DEVELOPMENT)
0a7de745 1784 if (ip_input_measure) {
3e170ce0 1785 net_perf_measure_time(&net_perf, &start_tv, num_pkts);
0a7de745 1786 }
39037602 1787#endif /* (DEBUG || DEVELOPMENT) */
1788 return;
1789 }
39037602 1790#if (DEBUG || DEVELOPMENT)
0a7de745 1791 if (ip_input_measure) {
3e170ce0 1792 net_perf_start_time(&net_perf, &start_tv);
0a7de745 1793 }
39037602 1794#endif /* (DEBUG || DEVELOPMENT) */
1795
1796 bzero(&pktchain_tbl, sizeof(pktchain_tbl));
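	/*
	 * Two-pass scheme: ip_input_first_pass() validates each packet
	 * and reports whether it may be chained; chainable packets are
	 * grouped into chains in pktchain_tbl via ip_chain_insert() and
	 * later delivered in bulk by ip_input_second_pass_loop_tbl().
	 */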
1797restart_list_process:
1798 chain = 0;
1799 for (packet = packet_list; packet; packet = packet_list) {
1800 packet_list = mbuf_nextpkt(packet);
1801 mbuf_setnextpkt(packet, NULL);
1802
1803 num_pkts++;
1804 modm = NULL;
1805 div_info = 0;
0a7de745 1806 bzero(&args, sizeof(args));
1807
1808 retval = ip_input_first_pass(packet, &div_info, &args,
1809 &ours, &modm);
1810
1811 if (retval == IPINPUT_DOCHAIN) {
0a7de745 1812 if (modm) {
3e170ce0 1813 packet = modm;
0a7de745 1814 }
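			/*
			 * ip_chain_insert() returns NULL when the packet has
			 * been linked onto a chain in pktchain_tbl; a
			 * non-NULL return indicates a hash-bucket collision,
			 * in which case chaining stops for this batch.
			 */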
1815 packet = ip_chain_insert(packet, &pktchain_tbl[0]);
1816 if (packet == NULL) {
1817 ipstat.ips_rxc_chained++;
1818 chain++;
0a7de745 1819 if (chain > ip_chainsz) {
3e170ce0 1820 break;
0a7de745 1821 }
1822 } else {
1823 ipstat.ips_rxc_collisions++;
1824 break;
316670eb 1825 }
1826 } else if (retval == IPINPUT_DONTCHAIN) {
1827 /* in order to preserve order, exit from chaining */
0a7de745 1828 if (modm) {
3e170ce0 1829 packet = modm;
0a7de745 1830 }
1831 ipstat.ips_rxc_notchain++;
1832 break;
1833 } else {
1834 /* packet was freed or delivered, do nothing. */
91447636 1835 }
91447636 1836 }
316670eb 1837
3e170ce0 1838 /* do second pass here for pktchain_tbl */
0a7de745 1839 if (chain) {
3e170ce0 1840 ip_input_second_pass_loop_tbl(&pktchain_tbl[0], &args);
0a7de745 1841 }
316670eb 1842
1843 if (packet) {
1844 /*
 1845		 * The equivalent update for the chaining case is performed in
 1846		 * ip_input_second_pass_loop_tbl().
1847 */
39037602 1848#if (DEBUG || DEVELOPMENT)
0a7de745 1849 if (ip_input_measure) {
3e170ce0 1850 net_perf_histogram(&net_perf, 1);
0a7de745 1851 }
39037602 1852#endif /* (DEBUG || DEVELOPMENT) */
1853 ip_input_second_pass(packet, packet->m_pkthdr.rcvif, div_info,
1854 1, packet->m_pkthdr.len, &args, ours);
91447636 1855 }
0b4c1975 1856
0a7de745 1857 if (packet_list) {
3e170ce0 1858 goto restart_list_process;
0a7de745 1859 }
91447636 1860
39037602 1861#if (DEBUG || DEVELOPMENT)
0a7de745 1862 if (ip_input_measure) {
3e170ce0 1863 net_perf_measure_time(&net_perf, &start_tv, num_pkts);
0a7de745 1864 }
39037602 1865#endif /* (DEBUG || DEVELOPMENT) */
3e170ce0 1866}
1867/*
 1868 * IP input routine. Checksum and byte swap header. If fragmented
1869 * try to reassemble. Process options. Pass to next level.
1870 */
1871void
1872ip_input(struct mbuf *m)
1873{
1874 struct ip *ip;
9bccf70c 1875 struct in_ifaddr *ia = NULL;
39236c6e 1876 unsigned int hlen, checkif;
316670eb 1877 u_short sum = 0;
9bccf70c 1878 struct in_addr pkt_dst;
4a3eedf9 1879#if IPFIREWALL
0b4c1975 1880 int i;
0a7de745 1881 u_int32_t div_info = 0; /* packet divert/tee info */
1882#endif
1883#if IPFIREWALL || DUMMYNET
91447636 1884 struct ip_fw_args args;
0a7de745 1885 struct m_tag *tag;
4a3eedf9 1886#endif
1887 ipfilter_t inject_filter_ref = NULL;
1888 struct ifnet *inifp;
b0d623f7 1889
1890 /* Check if the mbuf is still valid after interface filter processing */
1891 MBUF_INPUT_CHECK(m, m->m_pkthdr.rcvif);
1892 inifp = m->m_pkthdr.rcvif;
1893 VERIFY(inifp != NULL);
6d2010ae 1894
1895 ipstat.ips_rxc_notlist++;
1896
316670eb 1897 /* Perform IP header alignment fixup, if needed */
1898 IP_HDR_ALIGNMENT_FIXUP(m, inifp, goto bad);
1899
1900 m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
1901
1902#if IPFIREWALL || DUMMYNET
0a7de745 1903 bzero(&args, sizeof(struct ip_fw_args));
91447636 1904
1905 /*
1906 * Don't bother searching for tag(s) if there's none.
1907 */
0a7de745 1908 if (SLIST_EMPTY(&m->m_pkthdr.tags)) {
b0d623f7 1909 goto ipfw_tags_done;
0a7de745 1910 }
b0d623f7 1911
1912 /* Grab info from mtags prepended to the chain */
1913#if DUMMYNET
1914 if ((tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID,
1915 KERNEL_TAG_TYPE_DUMMYNET, NULL)) != NULL) {
39236c6e 1916 struct dn_pkt_tag *dn_tag;
b0d623f7 1917
0a7de745 1918 dn_tag = (struct dn_pkt_tag *)(tag + 1);
1919 args.fwa_ipfw_rule = dn_tag->dn_ipfw_rule;
1920 args.fwa_pf_rule = dn_tag->dn_pf_rule;
b0d623f7 1921
1922 m_tag_delete(m, tag);
1923 }
1924#endif /* DUMMYNET */
9bccf70c 1925
4a3eedf9 1926#if IPDIVERT
1927 if ((tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID,
1928 KERNEL_TAG_TYPE_DIVERT, NULL)) != NULL) {
39236c6e 1929 struct divert_tag *div_tag;
b0d623f7 1930
0a7de745 1931 div_tag = (struct divert_tag *)(tag + 1);
316670eb 1932 args.fwa_divert_rule = div_tag->cookie;
1c79356b 1933
1934 m_tag_delete(m, tag);
1935 }
1936#endif
1937
1938 if ((tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID,
1939 KERNEL_TAG_TYPE_IPFORWARD, NULL)) != NULL) {
39236c6e 1940 struct ip_fwd_tag *ipfwd_tag;
b0d623f7 1941
0a7de745 1942 ipfwd_tag = (struct ip_fwd_tag *)(tag + 1);
316670eb 1943 args.fwa_next_hop = ipfwd_tag->next_hop;
1c79356b 1944
1945 m_tag_delete(m, tag);
1946 }
b0d623f7 1947
1948#if DIAGNOSTIC
1949 if (m == NULL || !(m->m_flags & M_PKTHDR)) {
1c79356b 1950 panic("ip_input no HDR");
0a7de745 1951 }
1c79356b 1952#endif
91447636 1953
316670eb 1954#if DUMMYNET
1955 if (args.fwa_ipfw_rule || args.fwa_pf_rule) {
1956 /* dummynet already filtered us */
1957 ip = mtod(m, struct ip *);
1958 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1959 inject_filter_ref = ipf_get_inject_filter(m);
316670eb 1960#if IPFIREWALL
0a7de745 1961 if (args.fwa_ipfw_rule) {
316670eb 1962 goto iphack;
0a7de745 1963 }
316670eb 1964#endif /* IPFIREWALL */
0a7de745 1965 if (args.fwa_pf_rule) {
316670eb 1966 goto check_with_pf;
0a7de745 1967 }
91447636 1968 }
316670eb 1969#endif /* DUMMYNET */
b0d623f7 1970ipfw_tags_done:
39236c6e 1971#endif /* IPFIREWALL || DUMMYNET */
b0d623f7 1972
91447636 1973 /*
316670eb 1974 * No need to process packet twice if we've already seen it.
91447636 1975 */
0a7de745 1976 if (!SLIST_EMPTY(&m->m_pkthdr.tags)) {
b0d623f7 1977 inject_filter_ref = ipf_get_inject_filter(m);
0a7de745 1978 }
39236c6e 1979 if (inject_filter_ref != NULL) {
1980 ip = mtod(m, struct ip *);
1981 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
6d2010ae 1982
1983 DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL,
1984 struct ip *, ip, struct ifnet *, inifp,
1985 struct ip *, ip, struct ip6_hdr *, NULL);
1986
1987 ip->ip_len = ntohs(ip->ip_len) - hlen;
1988 ip->ip_off = ntohs(ip->ip_off);
1989 ip_proto_dispatch_in(m, hlen, ip->ip_p, inject_filter_ref);
1990 return;
1991 }
1992
b0d623f7 1993 OSAddAtomic(1, &ipstat.ips_total);
0a7de745 1994 if (m->m_pkthdr.len < sizeof(struct ip)) {
1c79356b 1995 goto tooshort;
0a7de745 1996 }
1c79356b 1997
1998 if (m->m_len < sizeof(struct ip) &&
1999 (m = m_pullup(m, sizeof(struct ip))) == NULL) {
b0d623f7 2000 OSAddAtomic(1, &ipstat.ips_toosmall);
2001 return;
2002 }
2003 ip = mtod(m, struct ip *);
2004
2005 KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr, ip->ip_src.s_addr,
2006 ip->ip_p, ip->ip_off, ip->ip_len);
2007
2008 if (IP_VHL_V(ip->ip_vhl) != IPVERSION) {
b0d623f7 2009 OSAddAtomic(1, &ipstat.ips_badvers);
2010 goto bad;
2011 }
2012
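	/*
	 * IP_VHL_HL() extracts the IHL field (header length in 32-bit
	 * words) from the combined version/header-length octet, and the
	 * shift by 2 converts it to bytes; e.g. ip_vhl 0x45 yields
	 * 5 words == 20 bytes, the minimum IPv4 header.
	 */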
2013 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
0a7de745 2014 if (hlen < sizeof(struct ip)) { /* minimum header length */
b0d623f7 2015 OSAddAtomic(1, &ipstat.ips_badhlen);
2016 goto bad;
2017 }
2018 if (hlen > m->m_len) {
39236c6e 2019 if ((m = m_pullup(m, hlen)) == NULL) {
b0d623f7 2020 OSAddAtomic(1, &ipstat.ips_badhlen);
2021 return;
2022 }
2023 ip = mtod(m, struct ip *);
2024 }
2025
2026 /* 127/8 must not appear on wire - RFC1122 */
2027 if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
2028 (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
2029 /*
2030 * Allow for the following exceptions:
2031 *
2032 * 1. If the packet was sent to loopback (i.e. rcvif
2033 * would have been set earlier at output time.)
2034 *
2035 * 2. If the packet was sent out on loopback from a local
2036 * source address which belongs to a non-loopback
2037 * interface (i.e. rcvif may not necessarily be a
2038 * loopback interface, hence the test for PKTF_LOOP.)
2039 * Unlike IPv6, there is no interface scope ID, and
2040 * therefore we don't care so much about PKTF_IFINFO.
2041 */
2042 if (!(inifp->if_flags & IFF_LOOPBACK) &&
2043 !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
b0d623f7 2044 OSAddAtomic(1, &ipstat.ips_badaddr);
2045 goto bad;
2046 }
2047 }
2048
2049 /* IPv4 Link-Local Addresses as defined in RFC3927 */
2050 if ((IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) ||
2051 IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)))) {
2052 ip_linklocal_stat.iplls_in_total++;
2053 if (ip->ip_ttl != MAXTTL) {
b0d623f7 2054 OSAddAtomic(1, &ip_linklocal_stat.iplls_in_badttl);
9bccf70c 2055 /* Silently drop link local traffic with bad TTL */
0a7de745 2056 if (!ip_linklocal_in_allowbadttl) {
9bccf70c 2057 goto bad;
0a7de745 2058 }
2059 }
2060 }
1c79356b 2061
316670eb 2062 sum = ip_cksum(m, hlen);
1c79356b 2063 if (sum) {
2064 goto bad;
2065 }
2066
2067 DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL,
2068 struct ip *, ip, struct ifnet *, inifp,
2069 struct ip *, ip, struct ip6_hdr *, NULL);
2070
2071 /*
2072 * Naively assume we can attribute inbound data to the route we would
3e170ce0 2073 * use to send to this destination. Asymmetric routing breaks this
2074 * assumption, but it still allows us to account for traffic from
2075 * a remote node in the routing table.
 2076	 * This lookup has a very significant performance impact, so we
 2077	 * bypass it if nstat_collect is disabled. In the future we may
 2078	 * also bypass it when the protocol is TCP, because TCP will have
 2079	 * a route that we can use to attribute the data to. That does
 2080	 * mean we would not account for forwarded TCP traffic.
2081 */
2082 if (nstat_collect) {
2083 struct rtentry *rt =
39236c6e 2084 ifnet_cached_rtlookup_inet(inifp, ip->ip_src);
2085 if (rt != NULL) {
2086 nstat_route_rx(rt, 1, m->m_pkthdr.len, 0);
2087 rtfree(rt);
2088 }
2089 }
2090
2091 /*
2092 * Convert fields to host representation.
2093 */
b0d623f7 2094#if BYTE_ORDER != BIG_ENDIAN
1c79356b 2095 NTOHS(ip->ip_len);
b0d623f7 2096#endif
39236c6e 2097
1c79356b 2098 if (ip->ip_len < hlen) {
b0d623f7 2099 OSAddAtomic(1, &ipstat.ips_badlen);
2100 goto bad;
2101 }
1c79356b 2102
2103#if BYTE_ORDER != BIG_ENDIAN
2104 NTOHS(ip->ip_off);
2105#endif
2106 /*
2107 * Check that the amount of data in the buffers
 2108	 * is at least as much as the IP header would have us expect.
2109 * Trim mbufs if longer than we expect.
2110 * Drop packet if shorter than we expect.
2111 */
2112 if (m->m_pkthdr.len < ip->ip_len) {
2113tooshort:
b0d623f7 2114 OSAddAtomic(1, &ipstat.ips_tooshort);
2115 goto bad;
2116 }
2117 if (m->m_pkthdr.len > ip->ip_len) {
5ba3f43e 2118 ip_input_adjust(m, ip, inifp);
1c79356b 2119 }
9bccf70c 2120
2121#if DUMMYNET
2122check_with_pf:
2123#endif
2124#if PF
2125 /* Invoke inbound packet filter */
316670eb 2126 if (PF_IS_ENABLED) {
6d2010ae 2127 int error;
316670eb 2128#if DUMMYNET
39236c6e 2129 error = pf_af_hook(inifp, NULL, &m, AF_INET, TRUE, &args);
316670eb 2130#else
39236c6e 2131 error = pf_af_hook(inifp, NULL, &m, AF_INET, TRUE, NULL);
2132#endif /* DUMMYNET */
2133 if (error != 0 || m == NULL) {
6d2010ae 2134 if (m != NULL) {
2135 panic("%s: unexpected packet %p\n",
2136 __func__, m);
2137 /* NOTREACHED */
2138 }
2139 /* Already freed by callee */
2140 return;
316670eb 2141 }
2142 ip = mtod(m, struct ip *);
2143 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
b0d623f7 2144 }
b0d623f7 2145#endif /* PF */
1c79356b 2146
6d2010ae 2147#if IPSEC
0a7de745 2148 if (ipsec_bypass == 0 && ipsec_gethist(m, NULL)) {
6d2010ae 2149 goto pass;
0a7de745 2150 }
2151#endif
2152
2153#if IPFIREWALL
2154#if DUMMYNET
1c79356b 2155iphack:
2d21ac55 2156#endif /* DUMMYNET */
2157 /*
2158 * Check if we want to allow this packet to be processed.
2159 * Consider it to be bad if not.
2160 */
91447636 2161 if (fw_enable && IPFW_LOADED) {
2162#if IPFIREWALL_FORWARD
2163 /*
2164 * If we've been forwarded from the output side, then
2165 * skip the firewall a second time
2166 */
0a7de745 2167 if (args.fwa_next_hop) {
1c79356b 2168 goto ours;
2169 }
2170#endif /* IPFIREWALL_FORWARD */
91447636 2171
316670eb 2172 args.fwa_m = m;
3a60a9f5 2173
91447636 2174 i = ip_fw_chk_ptr(&args);
316670eb 2175 m = args.fwa_m;
91447636 2176
39236c6e 2177 if ((i & IP_FW_PORT_DENY_FLAG) || m == NULL) { /* drop */
0a7de745 2178 if (m) {
3a60a9f5 2179 m_freem(m);
0a7de745 2180 }
9bccf70c 2181 return;
91447636 2182 }
9bccf70c 2183 ip = mtod(m, struct ip *); /* just in case m changed */
2184
2185 if (i == 0 && args.fwa_next_hop == NULL) { /* common case */
9bccf70c 2186 goto pass;
3a60a9f5 2187 }
1c79356b 2188#if DUMMYNET
39236c6e 2189 if (DUMMYNET_LOADED && (i & IP_FW_PORT_DYNT_FLAG) != 0) {
91447636 2190 /* Send packet to the appropriate pipe */
0a7de745 2191 ip_dn_io_ptr(m, i & 0xffff, DN_TO_IP_IN, &args,
39236c6e 2192 DN_CLIENT_IPFW);
9bccf70c 2193 return;
1c79356b 2194 }
91447636 2195#endif /* DUMMYNET */
1c79356b 2196#if IPDIVERT
2197 if (i != 0 && (i & IP_FW_PORT_DYNT_FLAG) == 0) {
2198 /* Divert or tee packet */
91447636 2199 div_info = i;
2200 goto ours;
2201 }
2202#endif
2203#if IPFIREWALL_FORWARD
316670eb 2204 if (i == 0 && args.fwa_next_hop != NULL) {
9bccf70c 2205 goto pass;
3a60a9f5 2206 }
2207#endif
2208 /*
2209 * if we get here, the packet must be dropped
2210 */
1c79356b 2211 m_freem(m);
9bccf70c 2212 return;
1c79356b 2213 }
2d21ac55 2214#endif /* IPFIREWALL */
39236c6e 2215#if IPSEC | IPFIREWALL
9bccf70c 2216pass:
39236c6e 2217#endif
2218 /*
2219 * Process options and, if not destined for us,
2220 * ship it on. ip_dooptions returns 1 when an
2221 * error was detected (causing an icmp message
2222 * to be sent and the original packet to be freed).
2223 */
0a7de745 2224 ip_nhops = 0; /* for source routed packets */
4a3eedf9 2225#if IPFIREWALL
0a7de745 2226 if (hlen > sizeof(struct ip) &&
2227 ip_dooptions(m, 0, args.fwa_next_hop)) {
2228#else /* !IPFIREWALL */
0a7de745 2229 if (hlen > sizeof(struct ip) && ip_dooptions(m, 0, NULL)) {
39236c6e 2230#endif /* !IPFIREWALL */
2231 return;
2232 }
2233
2234 /*
2235 * Check our list of addresses, to see if the packet is for us.
2236 * If we don't have any addresses, assume any unicast packet
2237 * we receive might be for us (and let the upper layers deal
2238 * with it).
1c79356b 2239 */
0a7de745 2240 if (TAILQ_EMPTY(&in_ifaddrhead) && !(m->m_flags & (M_MCAST | M_BCAST))) {
39236c6e 2241 ip_setdstifaddr_info(m, inifp->if_index, NULL);
9bccf70c 2242 goto ours;
39236c6e 2243 }
1c79356b 2244
2245 /*
2246 * Cache the destination address of the packet; this may be
2247 * changed by use of 'ipfw fwd'.
2248 */
4a3eedf9 2249#if IPFIREWALL
2250 pkt_dst = args.fwa_next_hop == NULL ?
2251 ip->ip_dst : args.fwa_next_hop->sin_addr;
39236c6e 2252#else /* !IPFIREWALL */
4a3eedf9 2253 pkt_dst = ip->ip_dst;
39236c6e 2254#endif /* !IPFIREWALL */
2255
2256 /*
2257 * Enable a consistency check between the destination address
2258 * and the arrival interface for a unicast packet (the RFC 1122
2259 * strong ES model) if IP forwarding is disabled and the packet
2260 * is not locally generated and the packet is not subject to
2261 * 'ipfw fwd'.
2262 *
2263 * XXX - Checking also should be disabled if the destination
2264 * address is ipnat'ed to a different interface.
2265 *
2266 * XXX - Checking is incompatible with IP aliases added
2267 * to the loopback interface instead of the interface where
2268 * the packets are received.
2269 */
2270 checkif = ip_checkinterface && (ipforwarding == 0) &&
2271 !(inifp->if_flags & IFF_LOOPBACK) &&
2272 !(m->m_pkthdr.pkt_flags & PKTF_LOOP)
4a3eedf9 2273#if IPFIREWALL
316670eb 2274 && (args.fwa_next_hop == NULL);
39236c6e 2275#else /* !IPFIREWALL */
0a7de745 2276 ;
39236c6e 2277#endif /* !IPFIREWALL */
9bccf70c 2278
2279 /*
2280 * Check for exact addresses in the hash bucket.
2281 */
2282 lck_rw_lock_shared(in_ifaddr_rwlock);
2283 TAILQ_FOREACH(ia, INADDR_HASH(pkt_dst.s_addr), ia_hash) {
1c79356b 2284 /*
2285 * If the address matches, verify that the packet
2286 * arrived via the correct interface if checking is
2287 * enabled.
1c79356b 2288 */
2289 if (IA_SIN(ia)->sin_addr.s_addr == pkt_dst.s_addr &&
2290 (!checkif || ia->ia_ifp == inifp)) {
2291 ip_setdstifaddr_info(m, 0, ia);
b0d623f7 2292 lck_rw_done(in_ifaddr_rwlock);
1c79356b 2293 goto ours;
91447636 2294 }
2295 }
2296 lck_rw_done(in_ifaddr_rwlock);
2297
2298 /*
2299 * Check for broadcast addresses.
2300 *
2301 * Only accept broadcast packets that arrive via the matching
 2302	 * interface. Reception of forwarded directed broadcasts would be
 2303	 * handled via ip_forward() and ether_frameout(), with the loopback
 2304	 * into the stack for SIMPLEX interfaces handled by ether_frameout().
2305 */
39236c6e 2306 if (inifp->if_flags & IFF_BROADCAST) {
b0d623f7 2307 struct ifaddr *ifa;
2308
2309 ifnet_lock_shared(inifp);
2310 TAILQ_FOREACH(ifa, &inifp->if_addrhead, ifa_link) {
6d2010ae 2311 if (ifa->ifa_addr->sa_family != AF_INET) {
b0d623f7 2312 continue;
6d2010ae 2313 }
b0d623f7 2314 ia = ifatoia(ifa);
1c79356b 2315 if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr ==
b0d623f7 2316 pkt_dst.s_addr || ia->ia_netbroadcast.s_addr ==
91447636 2317 pkt_dst.s_addr) {
2318 ip_setdstifaddr_info(m, 0, ia);
2319 ifnet_lock_done(inifp);
1c79356b 2320 goto ours;
91447636 2321 }
1c79356b 2322 }
39236c6e 2323 ifnet_lock_done(inifp);
1c79356b 2324 }
b0d623f7 2325
2326 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
2327 struct in_multi *inm;
2328 /*
2329 * See if we belong to the destination multicast group on the
2330 * arrival interface.
2331 */
6d2010ae 2332 in_multihead_lock_shared();
39236c6e 2333 IN_LOOKUP_MULTI(&ip->ip_dst, inifp, inm);
6d2010ae 2334 in_multihead_lock_done();
1c79356b 2335 if (inm == NULL) {
b0d623f7 2336 OSAddAtomic(1, &ipstat.ips_notmember);
2337 m_freem(m);
2338 return;
2339 }
39236c6e 2340 ip_setdstifaddr_info(m, inifp->if_index, NULL);
6d2010ae 2341 INM_REMREF(inm);
2342 goto ours;
2343 }
2344 if (ip->ip_dst.s_addr == (u_int32_t)INADDR_BROADCAST ||
2345 ip->ip_dst.s_addr == INADDR_ANY) {
2346 ip_setdstifaddr_info(m, inifp->if_index, NULL);
1c79356b 2347 goto ours;
39236c6e 2348 }
1c79356b 2349
9bccf70c 2350 /* Allow DHCP/BootP responses through */
39236c6e 2351 if ((inifp->if_eflags & IFEF_AUTOCONFIGURING) &&
0a7de745 2352 hlen == sizeof(struct ip) && ip->ip_p == IPPROTO_UDP) {
9bccf70c 2353 struct udpiphdr *ui;
39236c6e 2354
2355 if (m->m_len < sizeof(struct udpiphdr) &&
2356 (m = m_pullup(m, sizeof(struct udpiphdr))) == NULL) {
b0d623f7 2357 OSAddAtomic(1, &udpstat.udps_hdrops);
2358 return;
2359 }
2360 ui = mtod(m, struct udpiphdr *);
2361 if (ntohs(ui->ui_dport) == IPPORT_BOOTPC) {
39236c6e 2362 ip_setdstifaddr_info(m, inifp->if_index, NULL);
2363 goto ours;
2364 }
2365 ip = mtod(m, struct ip *); /* in case it changed */
2366 }
2367
2368 /*
2369 * Not for us; forward if possible and desirable.
2370 */
2371 if (ipforwarding == 0) {
b0d623f7 2372 OSAddAtomic(1, &ipstat.ips_cantforward);
1c79356b 2373 m_freem(m);
91447636 2374 } else {
4a3eedf9 2375#if IPFIREWALL
316670eb 2376 ip_forward(m, 0, args.fwa_next_hop);
4a3eedf9 2377#else
b0d623f7 2378 ip_forward(m, 0, NULL);
4a3eedf9 2379#endif
91447636 2380 }
2381 return;
2382
2383ours:
2384 /*
2385 * If offset or IP_MF are set, must reassemble.
1c79356b 2386 */
39236c6e 2387 if (ip->ip_off & ~(IP_DF | IP_RF)) {
1c79356b 2388 /*
483a1d10 2389 * ip_reass() will return a different mbuf, and update
316670eb 2390 * the divert info in div_info and args.fwa_divert_rule.
1c79356b 2391 */
9bccf70c 2392#if IPDIVERT
39236c6e 2393 m = ip_reass(m, (u_int16_t *)&div_info, &args.fwa_divert_rule);
9bccf70c 2394#else
39236c6e 2395 m = ip_reass(m);
9bccf70c 2396#endif
0a7de745 2397 if (m == NULL) {
39236c6e 2398 return;
0a7de745 2399 }
39236c6e
A
2400 ip = mtod(m, struct ip *);
2401 /* Get the header length of the reassembled packet */
2402 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1c79356b 2403#if IPDIVERT
39236c6e
A
2404 /* Restore original checksum before diverting packet */
2405 if (div_info != 0) {
b0d623f7 2406#if BYTE_ORDER != BIG_ENDIAN
39236c6e
A
2407 HTONS(ip->ip_len);
2408 HTONS(ip->ip_off);
b0d623f7 2409#endif
39236c6e
A
2410 ip->ip_sum = 0;
2411 ip->ip_sum = ip_cksum_hdr_in(m, hlen);
b0d623f7 2412#if BYTE_ORDER != BIG_ENDIAN
39236c6e
A
2413 NTOHS(ip->ip_off);
2414 NTOHS(ip->ip_len);
b0d623f7 2415#endif
39236c6e 2416 }
1c79356b 2417#endif
39236c6e
A
2418 }
2419
2420 /*
2421 * Further protocols expect the packet length to be w/o the
2422 * IP header.
2423 */
2424 ip->ip_len -= hlen;
1c79356b
A
2425
2426#if IPDIVERT
2427 /*
9bccf70c
A
2428 * Divert or tee packet to the divert protocol if required.
2429 *
91447636 2430 * If div_info is zero then cookie should be too, so we shouldn't
9bccf70c 2431 * need to clear them here. Assume divert_packet() does so also.
1c79356b 2432 */
91447636 2433 if (div_info != 0) {
9bccf70c
A
2434 struct mbuf *clone = NULL;
2435
2436 /* Clone packet if we're doing a 'tee' */
0a7de745 2437 if (div_info & IP_FW_PORT_TEE_FLAG) {
9bccf70c 2438 clone = m_dup(m, M_DONTWAIT);
0a7de745 2439 }
9bccf70c
A
2440
2441 /* Restore packet header fields to original values */
2442 ip->ip_len += hlen;
b0d623f7
A
2443
2444#if BYTE_ORDER != BIG_ENDIAN
9bccf70c
A
2445 HTONS(ip->ip_len);
2446 HTONS(ip->ip_off);
b0d623f7 2447#endif
9bccf70c 2448 /* Deliver packet to divert input routine */
b0d623f7 2449 OSAddAtomic(1, &ipstat.ips_delivered);
316670eb 2450 divert_packet(m, 1, div_info & 0xffff, args.fwa_divert_rule);
9bccf70c
A
2451
2452 /* If 'tee', continue with original packet */
91447636 2453 if (clone == NULL) {
9bccf70c 2454 return;
91447636 2455 }
9bccf70c
A
2456 m = clone;
2457 ip = mtod(m, struct ip *);
1c79356b 2458 }
9bccf70c 2459#endif
1c79356b 2460
9bccf70c
A
2461#if IPSEC
2462 /*
 2463	 * Enforce IPsec policy checking if we are seeing the last header.
 2464	 * Note that we do not visit this for protocols with PCB-layer
 2465	 * code, such as UDP, TCP and raw IP.
2466 */
39236c6e 2467 if (ipsec_bypass == 0 && (ip_protox[ip->ip_p]->pr_flags & PR_LASTHDR)) {
91447636 2468 if (ipsec4_in_reject(m, NULL)) {
2d21ac55 2469 IPSEC_STAT_INCREMENT(ipsecstat.in_polvio);
39236c6e 2470 goto bad;
91447636 2471 }
1c79356b 2472 }
39236c6e 2473#endif /* IPSEC */
1c79356b
A
2474
2475 /*
2476 * Switch out to protocol's input routine.
2477 */
b0d623f7 2478 OSAddAtomic(1, &ipstat.ips_delivered);
39236c6e 2479
4a3eedf9 2480#if IPFIREWALL
39236c6e
A
2481 if (args.fwa_next_hop && ip->ip_p == IPPROTO_TCP) {
2482 /* TCP needs IPFORWARD info if available */
2483 struct m_tag *fwd_tag;
2484 struct ip_fwd_tag *ipfwd_tag;
2485
2486 fwd_tag = m_tag_create(KERNEL_MODULE_TAG_ID,
0a7de745 2487 KERNEL_TAG_TYPE_IPFORWARD, sizeof(*ipfwd_tag),
39236c6e 2488 M_NOWAIT, m);
0a7de745 2489 if (fwd_tag == NULL) {
39236c6e 2490 goto bad;
0a7de745 2491 }
39236c6e 2492
0a7de745 2493 ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag + 1);
39236c6e
A
2494 ipfwd_tag->next_hop = args.fwa_next_hop;
2495
2496 m_tag_prepend(m, fwd_tag);
2497
2498 KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr,
2499 ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len);
2500
2501 /* TCP deals with its own locking */
2502 ip_proto_dispatch_in(m, hlen, ip->ip_p, 0);
2503 } else {
2504 KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr,
2505 ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len);
2506
316670eb
A
2507 if ((sw_lro) && (ip->ip_p == IPPROTO_TCP)) {
2508 m = tcp_lro(m, hlen);
0a7de745 2509 if (m == NULL) {
316670eb 2510 return;
0a7de745 2511 }
316670eb 2512 }
39236c6e 2513
4a3eedf9 2514 ip_proto_dispatch_in(m, hlen, ip->ip_p, 0);
9bccf70c 2515 }
39236c6e
A
2516#else /* !IPFIREWALL */
2517 if ((sw_lro) && (ip->ip_p == IPPROTO_TCP)) {
2518 m = tcp_lro(m, hlen);
0a7de745 2519 if (m == NULL) {
39236c6e 2520 return;
0a7de745 2521 }
39236c6e
A
2522 }
2523 ip_proto_dispatch_in(m, hlen, ip->ip_p, 0);
2524#endif /* !IPFIREWALL */
2525 return;
2526
1c79356b 2527bad:
39236c6e 2528 KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0);
1c79356b
A
2529 m_freem(m);
2530}
2531
39236c6e
A
2532static void
2533ipq_updateparams(void)
2534{
5ba3f43e 2535 LCK_MTX_ASSERT(&ipqlock, LCK_MTX_ASSERT_OWNED);
39236c6e
A
2536 /*
2537 * -1 for unlimited allocation.
2538 */
0a7de745 2539 if (maxnipq < 0) {
39236c6e 2540 ipq_limit = 0;
0a7de745 2541 }
39236c6e
A
2542 /*
2543 * Positive number for specific bound.
2544 */
0a7de745 2545 if (maxnipq > 0) {
39236c6e 2546 ipq_limit = maxnipq;
0a7de745 2547 }
39236c6e
A
2548 /*
2549 * Zero specifies no further fragment queue allocation -- set the
2550 * bound very low, but rely on implementation elsewhere to actually
2551 * prevent allocation and reclaim current queues.
2552 */
0a7de745 2553 if (maxnipq == 0) {
39236c6e 2554 ipq_limit = 1;
0a7de745 2555 }
39236c6e
A
2556 /*
2557 * Arm the purge timer if not already and if there's work to do
2558 */
2559 frag_sched_timeout();
2560}
2561
2562static int
2563sysctl_maxnipq SYSCTL_HANDLER_ARGS
2564{
2565#pragma unused(arg1, arg2)
2566 int error, i;
2567
2568 lck_mtx_lock(&ipqlock);
2569 i = maxnipq;
2570 error = sysctl_handle_int(oidp, &i, 0, req);
0a7de745 2571 if (error || req->newptr == USER_ADDR_NULL) {
39236c6e 2572 goto done;
0a7de745 2573 }
39236c6e
A
2574 /* impose bounds */
2575 if (i < -1 || i > (nmbclusters / 4)) {
2576 error = EINVAL;
2577 goto done;
2578 }
2579 maxnipq = i;
2580 ipq_updateparams();
2581done:
2582 lck_mtx_unlock(&ipqlock);
0a7de745 2583 return error;
39236c6e
A
2584}
2585
2586static int
2587sysctl_maxfragsperpacket SYSCTL_HANDLER_ARGS
2588{
2589#pragma unused(arg1, arg2)
2590 int error, i;
2591
2592 lck_mtx_lock(&ipqlock);
2593 i = maxfragsperpacket;
2594 error = sysctl_handle_int(oidp, &i, 0, req);
0a7de745 2595 if (error || req->newptr == USER_ADDR_NULL) {
39236c6e 2596 goto done;
0a7de745 2597 }
39236c6e 2598 maxfragsperpacket = i;
0a7de745 2599 ipq_updateparams(); /* see if we need to arm timer */
39236c6e
A
2600done:
2601 lck_mtx_unlock(&ipqlock);
0a7de745 2602 return error;
39236c6e
A
2603}
2604
1c79356b 2605/*
9bccf70c
A
2606 * Take incoming datagram fragment and try to reassemble it into
2607 * whole datagram. If a chain for reassembly of this datagram already
2608 * exists, then it is given as fp; otherwise have to make a chain.
2609 *
2610 * When IPDIVERT enabled, keep additional state with each packet that
2611 * tells us if we need to divert or tee the packet we're building.
39236c6e 2612 *
5ba3f43e 2613 * The IP header is *NOT* adjusted out of iplen (but in host byte order).
1c79356b 2614 */
9bccf70c
A
2615static struct mbuf *
2616#if IPDIVERT
39236c6e 2617ip_reass(struct mbuf *m,
9bccf70c 2618#ifdef IPDIVERT_44
39236c6e 2619 u_int32_t *divinfo,
2d21ac55 2620#else /* IPDIVERT_44 */
39236c6e 2621 u_int16_t *divinfo,
2d21ac55 2622#endif /* IPDIVERT_44 */
39236c6e 2623 u_int16_t *divcookie)
2d21ac55 2624#else /* IPDIVERT */
39236c6e 2625ip_reass(struct mbuf *m)
2d21ac55 2626#endif /* IPDIVERT */
1c79356b 2627{
39236c6e
A
2628 struct ip *ip;
2629 struct mbuf *p, *q, *nq, *t;
2630 struct ipq *fp = NULL;
2631 struct ipqhead *head;
2632 int i, hlen, next;
2d21ac55 2633 u_int8_t ecn, ecn0;
39236c6e
A
2634 uint32_t csum, csum_flags;
2635 uint16_t hash;
2636 struct fq_head dfq;
2637
0a7de745 2638 MBUFQ_INIT(&dfq); /* for deferred frees */
39236c6e
A
2639
2640 /* If maxnipq or maxfragsperpacket is 0, never accept fragments. */
2641 if (maxnipq == 0 || maxfragsperpacket == 0) {
2642 ipstat.ips_fragments++;
2643 ipstat.ips_fragdropped++;
2644 m_freem(m);
2645 if (nipq > 0) {
2646 lck_mtx_lock(&ipqlock);
0a7de745 2647 frag_sched_timeout(); /* purge stale fragments */
39236c6e
A
2648 lck_mtx_unlock(&ipqlock);
2649 }
0a7de745 2650 return NULL;
39236c6e
A
2651 }
2652
2653 ip = mtod(m, struct ip *);
2654 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
2655
2656 lck_mtx_lock(&ipqlock);
2657
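	/*
	 * Fragments are bucketed by hashing the source address and IP ID;
	 * within a bucket, a reassembly queue matches only if the ID,
	 * source, destination and protocol all agree (checked below).
	 */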
2658 hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id);
2659 head = &ipq[hash];
2660
2661 /*
2662 * Look for queue of fragments
2663 * of this datagram.
2664 */
2665 TAILQ_FOREACH(fp, head, ipq_list) {
2666 if (ip->ip_id == fp->ipq_id &&
2667 ip->ip_src.s_addr == fp->ipq_src.s_addr &&
2668 ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
2669#if CONFIG_MACF_NET
2670 mac_ipq_label_compare(m, fp) &&
2671#endif
0a7de745 2672 ip->ip_p == fp->ipq_p) {
39236c6e 2673 goto found;
0a7de745 2674 }
39236c6e
A
2675 }
2676
2677 fp = NULL;
2678
2679 /*
2680 * Attempt to trim the number of allocated fragment queues if it
2681 * exceeds the administrative limit.
2682 */
2683 if ((nipq > (unsigned)maxnipq) && (maxnipq > 0)) {
2684 /*
2685 * drop something from the tail of the current queue
2686 * before proceeding further
2687 */
2688 struct ipq *fq = TAILQ_LAST(head, ipqhead);
2689 if (fq == NULL) { /* gak */
2690 for (i = 0; i < IPREASS_NHASH; i++) {
2691 struct ipq *r = TAILQ_LAST(&ipq[i], ipqhead);
2692 if (r) {
2693 ipstat.ips_fragtimeout += r->ipq_nfrags;
2694 frag_freef(&ipq[i], r);
2695 break;
2696 }
2697 }
2698 } else {
2699 ipstat.ips_fragtimeout += fq->ipq_nfrags;
2700 frag_freef(head, fq);
2701 }
2702 }
2703
2704found:
2705 /*
2706 * Leverage partial checksum offload for IP fragments. Narrow down
2707 * the scope to cover only UDP without IP options, as that is the
2708 * most common case.
2709 *
2710 * Perform 1's complement adjustment of octets that got included/
2711 * excluded in the hardware-calculated checksum value. Ignore cases
5ba3f43e
A
2712 * where the value includes the entire IPv4 header span, as the sum
2713 * for those octets would already be 0 by the time we get here; IP
2714 * has already performed its header checksum validation. Also take
2715 * care of any trailing bytes and subtract out their partial sum.
39236c6e 2716 */
0a7de745 2717 if (ip->ip_p == IPPROTO_UDP && hlen == sizeof(struct ip) &&
39236c6e
A
2718 (m->m_pkthdr.csum_flags &
2719 (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
2720 (CSUM_DATA_VALID | CSUM_PARTIAL)) {
5ba3f43e
A
2721 uint32_t start = m->m_pkthdr.csum_rx_start;
2722 int32_t trailer = (m_pktlen(m) - ip->ip_len);
2723 uint32_t swbytes = (uint32_t)trailer;
39236c6e 2724
39236c6e 2725 csum = m->m_pkthdr.csum_rx_val;
1c79356b 2726
5ba3f43e
A
2727 ASSERT(trailer >= 0);
2728 if ((start != 0 && start != hlen) || trailer != 0) {
cb323159
A
2729 uint32_t datalen = ip->ip_len - hlen;
2730
39236c6e
A
2731#if BYTE_ORDER != BIG_ENDIAN
2732 if (start < hlen) {
2733 HTONS(ip->ip_len);
2734 HTONS(ip->ip_off);
2735 }
5ba3f43e 2736#endif /* BYTE_ORDER != BIG_ENDIAN */
39236c6e 2737 /* callee folds in sum */
cb323159 2738 csum = m_adj_sum16(m, start, hlen, datalen, csum);
0a7de745 2739 if (hlen > start) {
5ba3f43e 2740 swbytes += (hlen - start);
0a7de745 2741 } else {
5ba3f43e 2742 swbytes += (start - hlen);
0a7de745 2743 }
39236c6e
A
2744#if BYTE_ORDER != BIG_ENDIAN
2745 if (start < hlen) {
2746 NTOHS(ip->ip_off);
2747 NTOHS(ip->ip_len);
2748 }
5ba3f43e 2749#endif /* BYTE_ORDER != BIG_ENDIAN */
39236c6e
A
2750 }
2751 csum_flags = m->m_pkthdr.csum_flags;
5ba3f43e 2752
0a7de745 2753 if (swbytes != 0) {
5ba3f43e 2754 udp_in_cksum_stats(swbytes);
0a7de745
A
2755 }
2756 if (trailer != 0) {
5ba3f43e 2757 m_adj(m, -trailer);
0a7de745 2758 }
39236c6e
A
2759 } else {
2760 csum = 0;
2761 csum_flags = 0;
2762 }
2763
2764 /* Invalidate checksum */
2765 m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
2766
2767 ipstat.ips_fragments++;
2768
2769 /*
2770 * Adjust ip_len to not reflect header,
2771 * convert offset of this to bytes.
2772 */
2773 ip->ip_len -= hlen;
2774 if (ip->ip_off & IP_MF) {
2775 /*
2776 * Make sure that fragments have a data length
2777 * that's a non-zero multiple of 8 bytes.
2778 */
2779 if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) {
2780 OSAddAtomic(1, &ipstat.ips_toosmall);
2781 /*
2782 * Reassembly queue may have been found if previous
2783 * fragments were valid; given that this one is bad,
2784 * we need to drop it. Make sure to set fp to NULL
2785 * if not already, since we don't want to decrement
2786 * ipq_nfrags as it doesn't include this packet.
2787 */
2788 fp = NULL;
2789 goto dropfrag;
2790 }
2791 m->m_flags |= M_FRAG;
2792 } else {
2793 /* Clear the flag in case packet comes from loopback */
2794 m->m_flags &= ~M_FRAG;
2795 }
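	/*
	 * The 13-bit fragment offset field counts 8-byte units, so shift
	 * left by 3 to convert it into a byte offset for the overlap
	 * trimming logic below.
	 */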
2796 ip->ip_off <<= 3;
2797
2798 m->m_pkthdr.pkt_hdr = ip;
2799
2800 /* Previous ip_reass() started here. */
1c79356b
A
2801 /*
2802 * Presence of header sizes in mbufs
2803 * would confuse code below.
2804 */
2805 m->m_data += hlen;
2806 m->m_len -= hlen;
2807
2808 /*
2809 * If first fragment to arrive, create a reassembly queue.
2810 */
39236c6e
A
2811 if (fp == NULL) {
2812 fp = ipq_alloc(M_DONTWAIT);
0a7de745 2813 if (fp == NULL) {
1c79356b 2814 goto dropfrag;
0a7de745 2815 }
2d21ac55
A
2816#if CONFIG_MACF_NET
2817 if (mac_ipq_label_init(fp, M_NOWAIT) != 0) {
39236c6e 2818 ipq_free(fp);
2d21ac55
A
2819 fp = NULL;
2820 goto dropfrag;
2821 }
2822 mac_ipq_label_associate(m, fp);
2823#endif
39236c6e 2824 TAILQ_INSERT_HEAD(head, fp, ipq_list);
1c79356b 2825 nipq++;
483a1d10 2826 fp->ipq_nfrags = 1;
1c79356b
A
2827 fp->ipq_ttl = IPFRAGTTL;
2828 fp->ipq_p = ip->ip_p;
2829 fp->ipq_id = ip->ip_id;
2830 fp->ipq_src = ip->ip_src;
2831 fp->ipq_dst = ip->ip_dst;
2832 fp->ipq_frags = m;
2833 m->m_nextpkt = NULL;
39236c6e
A
2834 /*
2835 * If the first fragment has valid checksum offload
2836 * info, the rest of fragments are eligible as well.
2837 */
2838 if (csum_flags != 0) {
2839 fp->ipq_csum = csum;
2840 fp->ipq_csum_flags = csum_flags;
2841 }
1c79356b 2842#if IPDIVERT
39236c6e
A
2843 /*
2844 * Transfer firewall instructions to the fragment structure.
2845 * Only trust info in the fragment at offset 0.
2846 */
2847 if (ip->ip_off == 0) {
9bccf70c 2848#ifdef IPDIVERT_44
39236c6e 2849 fp->ipq_div_info = *divinfo;
9bccf70c 2850#else
39236c6e 2851 fp->ipq_divert = *divinfo;
9bccf70c 2852#endif
39236c6e
A
2853 fp->ipq_div_cookie = *divcookie;
2854 }
2855 *divinfo = 0;
2856 *divcookie = 0;
2857#endif /* IPDIVERT */
0a7de745 2858 m = NULL; /* nothing to return */
39236c6e 2859 goto done;
483a1d10
A
2860 } else {
2861 fp->ipq_nfrags++;
2d21ac55
A
2862#if CONFIG_MACF_NET
2863 mac_ipq_label_update(m, fp);
2864#endif
1c79356b
A
2865 }
2866
0a7de745 2867#define GETIP(m) ((struct ip *)((m)->m_pkthdr.pkt_hdr))
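/*
 * Each queued fragment stashes a pointer to its IP header in
 * m_pkthdr.pkt_hdr (set earlier), so GETIP() lets the sorting and
 * overlap-trimming code below compare ip_off and ip_len directly.
 */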
1c79356b 2868
2d21ac55
A
2869 /*
2870 * Handle ECN by comparing this segment with the first one;
2871 * if CE is set, do not lose CE.
2872 * drop if CE and not-ECT are mixed for the same packet.
2873 */
2874 ecn = ip->ip_tos & IPTOS_ECN_MASK;
2875 ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK;
2876 if (ecn == IPTOS_ECN_CE) {
0a7de745 2877 if (ecn0 == IPTOS_ECN_NOTECT) {
2d21ac55 2878 goto dropfrag;
0a7de745
A
2879 }
2880 if (ecn0 != IPTOS_ECN_CE) {
2d21ac55 2881 GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE;
0a7de745 2882 }
2d21ac55 2883 }
0a7de745 2884 if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) {
2d21ac55 2885 goto dropfrag;
0a7de745 2886 }
2d21ac55 2887
1c79356b
A
2888 /*
2889 * Find a segment which begins after this one does.
2890 */
0a7de745
A
2891 for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
2892 if (GETIP(q)->ip_off > ip->ip_off) {
1c79356b 2893 break;
0a7de745
A
2894 }
2895 }
1c79356b
A
2896
2897 /*
2898 * If there is a preceding segment, it may provide some of
2899 * our data already. If so, drop the data from the incoming
2900 * segment. If it provides all of our data, drop us, otherwise
2901 * stick new segment in the proper place.
9bccf70c 2902 *
39236c6e 2903 * If some of the data is dropped from the preceding
9bccf70c 2904	 * segment, then its checksum is invalidated.
1c79356b
A
2905 */
2906 if (p) {
2907 i = GETIP(p)->ip_off + GETIP(p)->ip_len - ip->ip_off;
2908 if (i > 0) {
0a7de745 2909 if (i >= ip->ip_len) {
1c79356b 2910 goto dropfrag;
0a7de745 2911 }
9bccf70c 2912 m_adj(m, i);
39236c6e 2913 fp->ipq_csum_flags = 0;
1c79356b
A
2914 ip->ip_off += i;
2915 ip->ip_len -= i;
2916 }
2917 m->m_nextpkt = p->m_nextpkt;
2918 p->m_nextpkt = m;
2919 } else {
2920 m->m_nextpkt = fp->ipq_frags;
2921 fp->ipq_frags = m;
2922 }
2923
2924 /*
2925 * While we overlap succeeding segments trim them or,
2926 * if they are completely covered, dequeue them.
2927 */
2928 for (; q != NULL && ip->ip_off + ip->ip_len > GETIP(q)->ip_off;
39236c6e
A
2929 q = nq) {
2930 i = (ip->ip_off + ip->ip_len) - GETIP(q)->ip_off;
1c79356b
A
2931 if (i < GETIP(q)->ip_len) {
2932 GETIP(q)->ip_len -= i;
2933 GETIP(q)->ip_off += i;
2934 m_adj(q, i);
39236c6e 2935 fp->ipq_csum_flags = 0;
1c79356b
A
2936 break;
2937 }
2938 nq = q->m_nextpkt;
2939 m->m_nextpkt = nq;
39236c6e 2940 ipstat.ips_fragdropped++;
483a1d10 2941 fp->ipq_nfrags--;
39236c6e
A
2942 /* defer freeing until after lock is dropped */
2943 MBUFQ_ENQUEUE(&dfq, q);
1c79356b
A
2944 }
2945
39236c6e
A
2946 /*
2947 * If this fragment contains similar checksum offload info
2948 * as that of the existing ones, accumulate checksum. Otherwise,
2949 * invalidate checksum offload info for the entire datagram.
2950 */
0a7de745 2951 if (csum_flags != 0 && csum_flags == fp->ipq_csum_flags) {
39236c6e 2952 fp->ipq_csum += csum;
0a7de745 2953 } else if (fp->ipq_csum_flags != 0) {
39236c6e 2954 fp->ipq_csum_flags = 0;
0a7de745 2955 }
1c79356b
A
2956
2957#if IPDIVERT
2958 /*
9bccf70c 2959 * Transfer firewall instructions to the fragment structure.
483a1d10 2960 * Only trust info in the fragment at offset 0.
1c79356b 2961 */
483a1d10 2962 if (ip->ip_off == 0) {
9bccf70c 2963#ifdef IPDIVERT_44
39236c6e 2964 fp->ipq_div_info = *divinfo;
9bccf70c 2965#else
39236c6e 2966 fp->ipq_divert = *divinfo;
9bccf70c 2967#endif
39236c6e 2968 fp->ipq_div_cookie = *divcookie;
483a1d10 2969 }
9bccf70c
A
2970 *divinfo = 0;
2971 *divcookie = 0;
39236c6e 2972#endif /* IPDIVERT */
1c79356b
A
2973
2974 /*
483a1d10
A
2975 * Check for complete reassembly and perform frag per packet
2976 * limiting.
2977 *
2978 * Frag limiting is performed here so that the nth frag has
2979 * a chance to complete the packet before we drop the packet.
2980 * As a result, n+1 frags are actually allowed per packet, but
2981 * only n will ever be stored. (n = maxfragsperpacket.)
2982 *
1c79356b
A
2983 */
2984 next = 0;
2985 for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
483a1d10
A
2986 if (GETIP(q)->ip_off != next) {
2987 if (fp->ipq_nfrags > maxfragsperpacket) {
39236c6e
A
2988 ipstat.ips_fragdropped += fp->ipq_nfrags;
2989 frag_freef(head, fp);
483a1d10 2990 }
0a7de745 2991 m = NULL; /* nothing to return */
39236c6e 2992 goto done;
483a1d10 2993 }
1c79356b
A
2994 next += GETIP(q)->ip_len;
2995 }
2996 /* Make sure the last packet didn't have the IP_MF flag */
483a1d10
A
2997 if (p->m_flags & M_FRAG) {
2998 if (fp->ipq_nfrags > maxfragsperpacket) {
39236c6e
A
2999 ipstat.ips_fragdropped += fp->ipq_nfrags;
3000 frag_freef(head, fp);
483a1d10 3001 }
0a7de745 3002 m = NULL; /* nothing to return */
39236c6e 3003 goto done;
483a1d10 3004 }
1c79356b
A
3005
3006 /*
3007 * Reassembly is complete. Make sure the packet is a sane size.
3008 */
3009 q = fp->ipq_frags;
3010 ip = GETIP(q);
3011 if (next + (IP_VHL_HL(ip->ip_vhl) << 2) > IP_MAXPACKET) {
39236c6e
A
3012 ipstat.ips_toolong++;
3013 ipstat.ips_fragdropped += fp->ipq_nfrags;
3014 frag_freef(head, fp);
0a7de745 3015 m = NULL; /* nothing to return */
39236c6e 3016 goto done;
1c79356b
A
3017 }
3018
3019 /*
3020 * Concatenate fragments.
3021 */
3022 m = q;
3023 t = m->m_next;
39236c6e 3024 m->m_next = NULL;
1c79356b
A
3025 m_cat(m, t);
3026 nq = q->m_nextpkt;
39236c6e 3027 q->m_nextpkt = NULL;
1c79356b
A
3028 for (q = nq; q != NULL; q = nq) {
3029 nq = q->m_nextpkt;
3030 q->m_nextpkt = NULL;
3031 m_cat(m, q);
3032 }
3033
39236c6e
A
3034 /*
3035 * Store partial hardware checksum info from the fragment queue;
3036 * the receive start offset is set to 20 bytes (see code at the
3037 * top of this routine.)
3038 */
3039 if (fp->ipq_csum_flags != 0) {
3040 csum = fp->ipq_csum;
3041
3042 ADDCARRY(csum);
3043
3044 m->m_pkthdr.csum_rx_val = csum;
0a7de745 3045 m->m_pkthdr.csum_rx_start = sizeof(struct ip);
39236c6e
A
3046 m->m_pkthdr.csum_flags = fp->ipq_csum_flags;
3047 } else if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) ||
3048 (m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
3049 /* loopback checksums are always OK */
3050 m->m_pkthdr.csum_data = 0xffff;
39236c6e
A
3051 m->m_pkthdr.csum_flags =
3052 CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
3053 CSUM_IP_CHECKED | CSUM_IP_VALID;
3054 }
3055
1c79356b
A
3056#if IPDIVERT
3057 /*
9bccf70c 3058 * Extract firewall instructions from the fragment structure.
1c79356b 3059 */
9bccf70c
A
3060#ifdef IPDIVERT_44
3061 *divinfo = fp->ipq_div_info;
3062#else
3063 *divinfo = fp->ipq_divert;
3064#endif
3065 *divcookie = fp->ipq_div_cookie;
39236c6e 3066#endif /* IPDIVERT */
1c79356b 3067
2d21ac55
A
3068#if CONFIG_MACF_NET
3069 mac_mbuf_label_associate_ipq(fp, m);
3070 mac_ipq_label_destroy(fp);
3071#endif
1c79356b 3072 /*
39236c6e
A
3073 * Create header for new ip packet by modifying header of first
3074 * packet; dequeue and discard fragment reassembly header.
1c79356b
A
3075 * Make header visible.
3076 */
39236c6e 3077 ip->ip_len = (IP_VHL_HL(ip->ip_vhl) << 2) + next;
1c79356b
A
3078 ip->ip_src = fp->ipq_src;
3079 ip->ip_dst = fp->ipq_dst;
39236c6e 3080
0a7de745 3081 fp->ipq_frags = NULL; /* return to caller as 'm' */
39236c6e
A
3082 frag_freef(head, fp);
3083 fp = NULL;
3084
1c79356b
A
3085 m->m_len += (IP_VHL_HL(ip->ip_vhl) << 2);
3086 m->m_data -= (IP_VHL_HL(ip->ip_vhl) << 2);
3087 /* some debugging cruft by sklower, below, will go away soon */
0a7de745 3088 if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */
39236c6e 3089 m_fixhdr(m);
0a7de745 3090 }
39236c6e
A
3091 ipstat.ips_reassembled++;
3092
3093 /* arm the purge timer if not already and if there's work to do */
3094 frag_sched_timeout();
3095 lck_mtx_unlock(&ipqlock);
3096 /* perform deferred free (if needed) now that lock is dropped */
0a7de745 3097 if (!MBUFQ_EMPTY(&dfq)) {
39236c6e 3098 MBUFQ_DRAIN(&dfq);
0a7de745 3099 }
39236c6e 3100 VERIFY(MBUFQ_EMPTY(&dfq));
0a7de745 3101 return m;
1c79356b 3102
39236c6e
A
3103done:
3104 VERIFY(m == NULL);
3105 /* arm the purge timer if not already and if there's work to do */
3106 frag_sched_timeout();
3107 lck_mtx_unlock(&ipqlock);
3108 /* perform deferred free (if needed) */
0a7de745 3109 if (!MBUFQ_EMPTY(&dfq)) {
39236c6e 3110 MBUFQ_DRAIN(&dfq);
0a7de745 3111 }
39236c6e 3112 VERIFY(MBUFQ_EMPTY(&dfq));
0a7de745 3113 return NULL;
39236c6e 3114
1c79356b
A
3115dropfrag:
3116#if IPDIVERT
9bccf70c
A
3117 *divinfo = 0;
3118 *divcookie = 0;
39236c6e
A
3119#endif /* IPDIVERT */
3120 ipstat.ips_fragdropped++;
0a7de745 3121 if (fp != NULL) {
483a1d10 3122 fp->ipq_nfrags--;
0a7de745 3123 }
39236c6e
A
3124 /* arm the purge timer if not already and if there's work to do */
3125 frag_sched_timeout();
3126 lck_mtx_unlock(&ipqlock);
1c79356b 3127 m_freem(m);
39236c6e 3128 /* perform deferred free (if needed) */
0a7de745 3129 if (!MBUFQ_EMPTY(&dfq)) {
39236c6e 3130 MBUFQ_DRAIN(&dfq);
0a7de745 3131 }
39236c6e 3132 VERIFY(MBUFQ_EMPTY(&dfq));
0a7de745 3133 return NULL;
1c79356b
A
3134#undef GETIP
3135}
3136
3137/*
3138 * Free a fragment reassembly header and all
3139 * associated datagrams.
3140 */
3141static void
39236c6e 3142frag_freef(struct ipqhead *fhp, struct ipq *fp)
1c79356b 3143{
5ba3f43e 3144 LCK_MTX_ASSERT(&ipqlock, LCK_MTX_ASSERT_OWNED);
39236c6e
A
3145
3146 fp->ipq_nfrags = 0;
3147 if (fp->ipq_frags != NULL) {
3148 m_freem_list(fp->ipq_frags);
3149 fp->ipq_frags = NULL;
3150 }
3151 TAILQ_REMOVE(fhp, fp, ipq_list);
1c79356b 3152 nipq--;
39236c6e 3153 ipq_free(fp);
1c79356b
A
3154}
3155
3156/*
39236c6e 3157 * IP reassembly timer processing
1c79356b 3158 */
39236c6e
A
3159static void
3160frag_timeout(void *arg)
1c79356b 3161{
39236c6e 3162#pragma unused(arg)
2d21ac55 3163 struct ipq *fp;
1c79356b 3164 int i;
39236c6e
A
3165
3166 /*
3167 * Update coarse-grained networking timestamp (in sec.); the idea
3168 * is to piggy-back on the timeout callout to update the counter
3169 * returnable via net_uptime().
3170 */
3171 net_update_uptime();
3172
3173 lck_mtx_lock(&ipqlock);
1c79356b 3174 for (i = 0; i < IPREASS_NHASH; i++) {
0a7de745 3175 for (fp = TAILQ_FIRST(&ipq[i]); fp;) {
39236c6e
A
3176 struct ipq *fpp;
3177
3178 fpp = fp;
3179 fp = TAILQ_NEXT(fp, ipq_list);
3180 if (--fpp->ipq_ttl == 0) {
3181 ipstat.ips_fragtimeout += fpp->ipq_nfrags;
3182 frag_freef(&ipq[i], fpp);
1c79356b
A
3183 }
3184 }
3185 }
9bccf70c
A
3186 /*
3187 * If we are over the maximum number of fragments
3188 * (due to the limit being lowered), drain off
3189 * enough to get down to the new limit.
3190 */
39236c6e
A
3191 if (maxnipq >= 0 && nipq > (unsigned)maxnipq) {
3192 for (i = 0; i < IPREASS_NHASH; i++) {
3193 while (nipq > (unsigned)maxnipq &&
3194 !TAILQ_EMPTY(&ipq[i])) {
3195 ipstat.ips_fragdropped +=
3196 TAILQ_FIRST(&ipq[i])->ipq_nfrags;
3197 frag_freef(&ipq[i], TAILQ_FIRST(&ipq[i]));
9bccf70c
A
3198 }
3199 }
3200 }
39236c6e
A
3201 /* re-arm the purge timer if there's work to do */
3202 frag_timeout_run = 0;
3203 frag_sched_timeout();
3204 lck_mtx_unlock(&ipqlock);
3205}
3206
3207static void
3208frag_sched_timeout(void)
3209{
5ba3f43e 3210 LCK_MTX_ASSERT(&ipqlock, LCK_MTX_ASSERT_OWNED);
39236c6e
A
3211
3212 if (!frag_timeout_run && nipq > 0) {
3213 frag_timeout_run = 1;
3214 timeout(frag_timeout, NULL, hz);
3215 }
1c79356b
A
3216}
3217
3218/*
3219 * Drain off all datagram fragments.
3220 */
39236c6e
A
3221static void
3222frag_drain(void)
1c79356b 3223{
39236c6e 3224 int i;
1c79356b 3225
39236c6e 3226 lck_mtx_lock(&ipqlock);
1c79356b 3227 for (i = 0; i < IPREASS_NHASH; i++) {
39236c6e
A
3228 while (!TAILQ_EMPTY(&ipq[i])) {
3229 ipstat.ips_fragdropped +=
3230 TAILQ_FIRST(&ipq[i])->ipq_nfrags;
3231 frag_freef(&ipq[i], TAILQ_FIRST(&ipq[i]));
1c79356b
A
3232 }
3233 }
39236c6e
A
3234 lck_mtx_unlock(&ipqlock);
3235}
3236
3237static struct ipq *
3238ipq_alloc(int how)
3239{
3240 struct mbuf *t;
3241 struct ipq *fp;
3242
3243 /*
3244 * See comments in ipq_updateparams(). Keep the count separate
3245 * from nipq since the latter represents the elements already
3246 * in the reassembly queues.
3247 */
0a7de745
A
3248 if (ipq_limit > 0 && ipq_count > ipq_limit) {
3249 return NULL;
3250 }
39236c6e
A
3251
3252 t = m_get(how, MT_FTABLE);
3253 if (t != NULL) {
3254 atomic_add_32(&ipq_count, 1);
3255 fp = mtod(t, struct ipq *);
0a7de745 3256 bzero(fp, sizeof(*fp));
39236c6e
A
3257 } else {
3258 fp = NULL;
3259 }
0a7de745 3260 return fp;
39236c6e
A
3261}
3262
3263static void
3264ipq_free(struct ipq *fp)
3265{
3266 (void) m_free(dtom(fp));
3267 atomic_add_32(&ipq_count, -1);
3268}
3269
3270/*
3271 * Drain callback
3272 */
3273void
3274ip_drain(void)
3275{
0a7de745
A
3276 frag_drain(); /* fragments */
3277 in_rtqdrain(); /* protocol cloned routes */
3278 in_arpdrain(NULL); /* cloned routes: ARP */
1c79356b
A
3279}
3280
3281/*
3282 * Do option processing on a datagram,
3283 * possibly discarding it if bad options are encountered,
3284 * or forwarding it if source-routed.
91447636
A
3285 * The pass argument is used when operating in the IPSTEALTH
3286 * mode to tell what options to process:
3287 * [LS]SRR (pass 0) or the others (pass 1).
3288 * The reason for as many as two passes is that when doing IPSTEALTH,
3289 * non-routing options should be processed only if the packet is for us.
1c79356b
A
3290 * Returns 1 if packet has been forwarded/freed,
3291 * 0 if the packet should be processed further.
3292 */
3293static int
39236c6e 3294ip_dooptions(struct mbuf *m, int pass, struct sockaddr_in *next_hop)
1c79356b 3295{
39236c6e 3296#pragma unused(pass)
2d21ac55
A
3297 struct ip *ip = mtod(m, struct ip *);
3298 u_char *cp;
3299 struct ip_timestamp *ipt;
3300 struct in_ifaddr *ia;
1c79356b
A
3301 int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0;
3302 struct in_addr *sin, dst;
04b8595b 3303 u_int32_t ntime;
b0d623f7 3304 struct sockaddr_in ipaddr = {
cb323159
A
3305 .sin_len = sizeof(ipaddr),
3306 .sin_family = AF_INET,
3307 .sin_port = 0,
3308 .sin_addr = { .s_addr = 0 },
3309 .sin_zero = { 0, }
0a7de745 3310 };
1c79356b 3311
316670eb
A
3312 /* Expect 32-bit aligned data pointer on strict-align platforms */
3313 MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
3314
1c79356b
A
3315 dst = ip->ip_dst;
3316 cp = (u_char *)(ip + 1);
0a7de745 3317 cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof(struct ip);
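	/*
	 * Options are laid out as type-length-value records: cp[IPOPT_OPTVAL]
	 * holds the option type, cp[IPOPT_OLEN] the total option length, and,
	 * for the routing and timestamp options, cp[IPOPT_OFFSET] a 1-based
	 * pointer into the option data; EOL and NOP are single-byte options.
	 */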
1c79356b
A
3318 for (; cnt > 0; cnt -= optlen, cp += optlen) {
3319 opt = cp[IPOPT_OPTVAL];
0a7de745 3320 if (opt == IPOPT_EOL) {
1c79356b 3321 break;
0a7de745
A
3322 }
3323 if (opt == IPOPT_NOP) {
1c79356b 3324 optlen = 1;
0a7de745
A
3325 } else {
3326 if (cnt < IPOPT_OLEN + sizeof(*cp)) {
9bccf70c 3327 code = &cp[IPOPT_OLEN] - (u_char *)ip;
1c79356b
A
3328 goto bad;
3329 }
3330 optlen = cp[IPOPT_OLEN];
0a7de745 3331 if (optlen < IPOPT_OLEN + sizeof(*cp) ||
39236c6e 3332 optlen > cnt) {
1c79356b
A
3333 code = &cp[IPOPT_OLEN] - (u_char *)ip;
3334 goto bad;
3335 }
3336 }
3337 switch (opt) {
1c79356b
A
3338 default:
3339 break;
3340
3341 /*
3342 * Source routing with record.
3343 * Find interface with current destination address.
3344 * If none on this machine then drop if strictly routed,
3345 * or do nothing if loosely routed.
3346 * Record interface address and bring up next address
3347 * component. If strictly routed make sure next
3348 * address is on directly accessible net.
3349 */
3350 case IPOPT_LSRR:
3351 case IPOPT_SSRR:
0a7de745 3352 if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
9bccf70c
A
3353 code = &cp[IPOPT_OLEN] - (u_char *)ip;
3354 goto bad;
3355 }
1c79356b
A
3356 if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
3357 code = &cp[IPOPT_OFFSET] - (u_char *)ip;
3358 goto bad;
3359 }
3360 ipaddr.sin_addr = ip->ip_dst;
39236c6e
A
3361 ia = (struct in_ifaddr *)ifa_ifwithaddr(SA(&ipaddr));
3362 if (ia == NULL) {
1c79356b
A
3363 if (opt == IPOPT_SSRR) {
3364 type = ICMP_UNREACH;
3365 code = ICMP_UNREACH_SRCFAIL;
3366 goto bad;
3367 }
0a7de745 3368 if (!ip_dosourceroute) {
1c79356b 3369 goto nosourcerouting;
0a7de745 3370 }
1c79356b
A
3371 /*
3372 * Loose routing, and not at next destination
3373 * yet; nothing to do except forward.
3374 */
3375 break;
39236c6e 3376 } else {
6d2010ae 3377 IFA_REMREF(&ia->ia_ifa);
91447636
A
3378 ia = NULL;
3379 }
0a7de745
A
3380 off--; /* 0 origin */
3381 if (off > optlen - (int)sizeof(struct in_addr)) {
1c79356b
A
3382 /*
3383 * End of source route. Should be for us.
3384 */
0a7de745 3385 if (!ip_acceptsourceroute) {
1c79356b 3386 goto nosourcerouting;
0a7de745 3387 }
1c79356b
A
3388 save_rte(cp, ip->ip_src);
3389 break;
3390 }
3391
3392 if (!ip_dosourceroute) {
3393 if (ipforwarding) {
91447636
A
3394 char buf[MAX_IPv4_STR_LEN];
3395 char buf2[MAX_IPv4_STR_LEN];
1c79356b
A
3396 /*
3397 * Acting as a router, so generate ICMP
3398 */
3399nosourcerouting:
91447636 3400 log(LOG_WARNING,
39236c6e
A
3401 "attempted source route from %s "
3402 "to %s\n",
3403 inet_ntop(AF_INET, &ip->ip_src,
0a7de745 3404 buf, sizeof(buf)),
39236c6e 3405 inet_ntop(AF_INET, &ip->ip_dst,
0a7de745 3406 buf2, sizeof(buf2)));
1c79356b
A
3407 type = ICMP_UNREACH;
3408 code = ICMP_UNREACH_SRCFAIL;
3409 goto bad;
3410 } else {
3411 /*
39236c6e
A
3412 * Not acting as a router,
3413 * so silently drop.
1c79356b 3414 */
b0d623f7 3415 OSAddAtomic(1, &ipstat.ips_cantforward);
1c79356b 3416 m_freem(m);
0a7de745 3417 return 1;
1c79356b
A
3418 }
3419 }
3420
3421 /*
3422 * locate outgoing interface
3423 */
39236c6e 3424 (void) memcpy(&ipaddr.sin_addr, cp + off,
0a7de745 3425 sizeof(ipaddr.sin_addr));
1c79356b
A
3426
3427 if (opt == IPOPT_SSRR) {
0a7de745 3428#define INA struct in_ifaddr *
316670eb 3429 if ((ia = (INA)ifa_ifwithdstaddr(
0a7de745 3430 SA(&ipaddr))) == NULL) {
39236c6e 3431 ia = (INA)ifa_ifwithnet(SA(&ipaddr));
91447636
A
3432 }
3433 } else {
b0d623f7 3434 ia = ip_rtaddr(ipaddr.sin_addr);
91447636 3435 }
39236c6e 3436 if (ia == NULL) {
1c79356b
A
3437 type = ICMP_UNREACH;
3438 code = ICMP_UNREACH_SRCFAIL;
3439 goto bad;
3440 }
3441 ip->ip_dst = ipaddr.sin_addr;
6d2010ae 3442 IFA_LOCK(&ia->ia_ifa);
39236c6e 3443 (void) memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
0a7de745 3444 sizeof(struct in_addr));
6d2010ae
A
3445 IFA_UNLOCK(&ia->ia_ifa);
3446 IFA_REMREF(&ia->ia_ifa);
91447636 3447 ia = NULL;
0a7de745 3448 cp[IPOPT_OFFSET] += sizeof(struct in_addr);
1c79356b
A
3449 /*
3450 * Let ip_intr's mcast routing check handle mcast pkts
3451 */
3452 forward = !IN_MULTICAST(ntohl(ip->ip_dst.s_addr));
3453 break;
3454
3455 case IPOPT_RR:
0a7de745 3456 if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
1c79356b
A
3457 code = &cp[IPOPT_OFFSET] - (u_char *)ip;
3458 goto bad;
3459 }
3460 if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
3461 code = &cp[IPOPT_OFFSET] - (u_char *)ip;
3462 goto bad;
3463 }
3464 /*
3465 * If no space remains, ignore.
3466 */
0a7de745
A
3467 off--; /* 0 origin */
3468 if (off > optlen - (int)sizeof(struct in_addr)) {
1c79356b 3469 break;
0a7de745 3470 }
39236c6e 3471 (void) memcpy(&ipaddr.sin_addr, &ip->ip_dst,
0a7de745 3472 sizeof(ipaddr.sin_addr));
1c79356b
A
3473 /*
3474 * locate outgoing interface; if we're the destination,
3475 * use the incoming interface (should be same).
3476 */
39236c6e
A
3477 if ((ia = (INA)ifa_ifwithaddr(SA(&ipaddr))) == NULL) {
3478 if ((ia = ip_rtaddr(ipaddr.sin_addr)) == NULL) {
91447636
A
3479 type = ICMP_UNREACH;
3480 code = ICMP_UNREACH_HOST;
3481 goto bad;
3482 }
1c79356b 3483 }
6d2010ae 3484 IFA_LOCK(&ia->ia_ifa);
39236c6e 3485 (void) memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
0a7de745 3486 sizeof(struct in_addr));
6d2010ae
A
3487 IFA_UNLOCK(&ia->ia_ifa);
3488 IFA_REMREF(&ia->ia_ifa);
91447636 3489 ia = NULL;
0a7de745 3490 cp[IPOPT_OFFSET] += sizeof(struct in_addr);
1c79356b
A
3491 break;
3492
3493 case IPOPT_TS:
3494 code = cp - (u_char *)ip;
316670eb 3495 ipt = (struct ip_timestamp *)(void *)cp;
9bccf70c
A
3496 if (ipt->ipt_len < 4 || ipt->ipt_len > 40) {
3497 code = (u_char *)&ipt->ipt_len - (u_char *)ip;
1c79356b 3498 goto bad;
9bccf70c
A
3499 }
3500 if (ipt->ipt_ptr < 5) {
3501 code = (u_char *)&ipt->ipt_ptr - (u_char *)ip;
3502 goto bad;
3503 }
3504 if (ipt->ipt_ptr >
0a7de745 3505 ipt->ipt_len - (int)sizeof(int32_t)) {
9bccf70c
A
3506 if (++ipt->ipt_oflw == 0) {
3507 code = (u_char *)&ipt->ipt_ptr -
3508 (u_char *)ip;
1c79356b 3509 goto bad;
9bccf70c 3510 }
1c79356b
A
3511 break;
3512 }
316670eb 3513 sin = (struct in_addr *)(void *)(cp + ipt->ipt_ptr - 1);
1c79356b 3514 switch (ipt->ipt_flg) {
1c79356b
A
3515 case IPOPT_TS_TSONLY:
3516 break;
3517
3518 case IPOPT_TS_TSANDADDR:
0a7de745
A
3519 if (ipt->ipt_ptr - 1 + sizeof(n_time) +
3520 sizeof(struct in_addr) > ipt->ipt_len) {
9bccf70c
A
3521 code = (u_char *)&ipt->ipt_ptr -
3522 (u_char *)ip;
1c79356b 3523 goto bad;
9bccf70c 3524 }
1c79356b 3525 ipaddr.sin_addr = dst;
39236c6e
A
3526 ia = (INA)ifaof_ifpforaddr(SA(&ipaddr),
3527 m->m_pkthdr.rcvif);
0a7de745 3528 if (ia == NULL) {
1c79356b 3529 continue;
0a7de745 3530 }
6d2010ae 3531 IFA_LOCK(&ia->ia_ifa);
39236c6e 3532 (void) memcpy(sin, &IA_SIN(ia)->sin_addr,
0a7de745 3533 sizeof(struct in_addr));
6d2010ae 3534 IFA_UNLOCK(&ia->ia_ifa);
0a7de745 3535 ipt->ipt_ptr += sizeof(struct in_addr);
6d2010ae 3536 IFA_REMREF(&ia->ia_ifa);
91447636 3537 ia = NULL;
1c79356b
A
3538 break;
3539
3540 case IPOPT_TS_PRESPEC:
0a7de745
A
3541 if (ipt->ipt_ptr - 1 + sizeof(n_time) +
3542 sizeof(struct in_addr) > ipt->ipt_len) {
9bccf70c
A
3543 code = (u_char *)&ipt->ipt_ptr -
3544 (u_char *)ip;
1c79356b 3545 goto bad;
9bccf70c 3546 }
39236c6e 3547 (void) memcpy(&ipaddr.sin_addr, sin,
0a7de745 3548 sizeof(struct in_addr));
39236c6e 3549 if ((ia = (struct in_ifaddr *)ifa_ifwithaddr(
0a7de745 3550 SA(&ipaddr))) == NULL) {
1c79356b 3551 continue;
0a7de745 3552 }
6d2010ae 3553 IFA_REMREF(&ia->ia_ifa);
91447636 3554 ia = NULL;
0a7de745 3555 ipt->ipt_ptr += sizeof(struct in_addr);
1c79356b
A
3556 break;
3557
3558 default:
9bccf70c
A
3559 /* XXX can't take &ipt->ipt_flg */
3560 code = (u_char *)&ipt->ipt_ptr -
3561 (u_char *)ip + 1;
1c79356b
A
3562 goto bad;
3563 }
3564 ntime = iptime();
39236c6e 3565 (void) memcpy(cp + ipt->ipt_ptr - 1, &ntime,
0a7de745
A
3566 sizeof(n_time));
3567 ipt->ipt_ptr += sizeof(n_time);
1c79356b
A
3568 }
3569 }
3570 if (forward && ipforwarding) {
b0d623f7 3571 ip_forward(m, 1, next_hop);
0a7de745 3572 return 1;
1c79356b 3573 }
0a7de745 3574 return 0;
1c79356b 3575bad:
1c79356b 3576 icmp_error(m, type, code, 0, 0);
b0d623f7 3577 OSAddAtomic(1, &ipstat.ips_badoptions);
0a7de745 3578 return 1;
1c79356b
A
3579}
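/*
 * A minimal user-space sketch (not the kernel code above) of the LSRR/SSRR
 * pointer arithmetic performed by ip_dooptions(): read the option's offset
 * byte, pull the next-hop address out of the option, record `local' (standing
 * in for the chosen interface address) in its place, and advance the pointer.
 * `cp' points at the start of the source-route option, `optlen' is its length.
 * Returns 1 if a next hop was extracted, 0 at the end of the route, -1 if the
 * option is malformed.
 */
#include <string.h>
#include <netinet/in.h>
#include <netinet/ip.h>

static int
srcroute_next_hop(unsigned char *cp, int optlen, struct in_addr local,
    struct in_addr *next_hop)
{
	int off;

	if (optlen < IPOPT_OFFSET + 1) {
		return -1;                              /* truncated option */
	}
	off = cp[IPOPT_OFFSET];
	if (off < IPOPT_MINOFF) {
		return -1;                              /* bad pointer */
	}
	off--;                                          /* 0 origin, as above */
	if (off > optlen - (int)sizeof(struct in_addr)) {
		return 0;                               /* end of source route */
	}
	memcpy(next_hop, cp + off, sizeof(*next_hop));  /* next destination */
	memcpy(cp + off, &local, sizeof(local));        /* record our address */
	cp[IPOPT_OFFSET] += sizeof(struct in_addr);     /* bump the pointer */
	return 1;
}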
3580
39236c6e
A
3581/*
3582 * Check for the presence of the IP Router Alert option [RFC2113]
3583 * in the header of an IPv4 datagram.
3584 *
3585 * This call is not intended for use from the forwarding path; it is here
3586 * so that protocol domains may check for the presence of the option.
3587 * Given how FreeBSD's IPv4 stack is currently structured, the Router Alert
3588 * option does not have much relevance to the implementation, though this
3589 * may change in future.
3590 * Router alert options SHOULD be passed if running in IPSTEALTH mode and
3591 * we are not the endpoint.
3592 * Length checks on individual options should already have been performed
3593 * by ip_dooptions(); therefore they are folded under DIAGNOSTIC here.
3594 *
3595 * Return zero if not present or options are invalid, non-zero if present.
3596 */
3597int
3598ip_checkrouteralert(struct mbuf *m)
3599{
3600 struct ip *ip = mtod(m, struct ip *);
3601 u_char *cp;
3602 int opt, optlen, cnt, found_ra;
3603
3604 found_ra = 0;
3605 cp = (u_char *)(ip + 1);
0a7de745 3606 cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof(struct ip);
39236c6e
A
3607 for (; cnt > 0; cnt -= optlen, cp += optlen) {
3608 opt = cp[IPOPT_OPTVAL];
0a7de745 3609 if (opt == IPOPT_EOL) {
39236c6e 3610 break;
0a7de745
A
3611 }
3612 if (opt == IPOPT_NOP) {
39236c6e 3613 optlen = 1;
0a7de745 3614 } else {
39236c6e 3615#ifdef DIAGNOSTIC
0a7de745 3616 if (cnt < IPOPT_OLEN + sizeof(*cp)) {
39236c6e 3617 break;
0a7de745 3618 }
39236c6e
A
3619#endif
3620 optlen = cp[IPOPT_OLEN];
3621#ifdef DIAGNOSTIC
0a7de745 3622 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) {
39236c6e 3623 break;
0a7de745 3624 }
39236c6e
A
3625#endif
3626 }
3627 switch (opt) {
3628 case IPOPT_RA:
3629#ifdef DIAGNOSTIC
0a7de745
A
3630 if (optlen != IPOPT_OFFSET + sizeof(uint16_t) ||
3631 (*((uint16_t *)(void *)&cp[IPOPT_OFFSET]) != 0)) {
39236c6e 3632 break;
0a7de745 3633 } else
39236c6e 3634#endif
0a7de745 3635 found_ra = 1;
39236c6e
A
3636 break;
3637 default:
3638 break;
3639 }
3640 }
3641
0a7de745 3642 return found_ra;
39236c6e
A
3643}
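/*
 * A user-space sketch of the same Router Alert scan over a plain buffer of
 * IPv4 options rather than an mbuf; `cnt' is the number of option bytes.
 * IPOPT_RA (148, RFC 2113) is assumed to be provided by <netinet/ip.h>.
 * Returns non-zero if a well-formed Router Alert option is present.
 */
#include <netinet/ip.h>

static int
options_have_router_alert(const unsigned char *cp, int cnt)
{
	int opt, optlen;

	for (; cnt > 0; cnt -= optlen, cp += optlen) {
		opt = cp[IPOPT_OPTVAL];
		if (opt == IPOPT_EOL) {
			break;
		}
		if (opt == IPOPT_NOP) {
			optlen = 1;
			continue;
		}
		if (cnt < IPOPT_OLEN + 1) {
			return 0;                       /* truncated option */
		}
		optlen = cp[IPOPT_OLEN];
		if (optlen < IPOPT_OLEN + 1 || optlen > cnt) {
			return 0;                       /* malformed length */
		}
		if (opt == IPOPT_RA && optlen == 4) {
			return 1;                       /* Router Alert found */
		}
	}
	return 0;
}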
3644
1c79356b
A
3645/*
3646 * Given address of next destination (final or next hop),
3647 * return internet address info of interface to be used to get there.
3648 */
91447636 3649struct in_ifaddr *
b0d623f7 3650ip_rtaddr(struct in_addr dst)
1c79356b 3651{
2d21ac55 3652 struct sockaddr_in *sin;
b0d623f7
A
3653 struct ifaddr *rt_ifa;
3654 struct route ro;
3655
0a7de745 3656 bzero(&ro, sizeof(ro));
39236c6e 3657 sin = SIN(&ro.ro_dst);
b0d623f7 3658 sin->sin_family = AF_INET;
0a7de745 3659 sin->sin_len = sizeof(*sin);
b0d623f7
A
3660 sin->sin_addr = dst;
3661
3662 rtalloc_ign(&ro, RTF_PRCLONING);
39236c6e
A
3663 if (ro.ro_rt == NULL) {
3664 ROUTE_RELEASE(&ro);
0a7de745 3665 return NULL;
39236c6e 3666 }
b0d623f7
A
3667
3668 RT_LOCK(ro.ro_rt);
0a7de745 3669 if ((rt_ifa = ro.ro_rt->rt_ifa) != NULL) {
6d2010ae 3670 IFA_ADDREF(rt_ifa);
0a7de745 3671 }
b0d623f7 3672 RT_UNLOCK(ro.ro_rt);
39236c6e 3673 ROUTE_RELEASE(&ro);
b0d623f7 3674
0a7de745 3675 return (struct in_ifaddr *)rt_ifa;
1c79356b
A
3676}
3677
3678/*
3679 * Save incoming source route for use in replies,
3680 * to be picked up later by ip_srcroute if the receiver is interested.
3681 */
3682void
2d21ac55 3683save_rte(u_char *option, struct in_addr dst)
1c79356b
A
3684{
3685 unsigned olen;
3686
3687 olen = option[IPOPT_OLEN];
3688#if DIAGNOSTIC
0a7de745 3689 if (ipprintfs) {
1c79356b 3690 printf("save_rte: olen %d\n", olen);
0a7de745 3691 }
1c79356b 3692#endif
0a7de745 3693 if (olen > sizeof(ip_srcrt) - (1 + sizeof(dst))) {
1c79356b 3694 return;
0a7de745 3695 }
1c79356b 3696 bcopy(option, ip_srcrt.srcopt, olen);
0a7de745 3697 ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr);
1c79356b
A
3698 ip_srcrt.dst = dst;
3699}
3700
3701/*
3702 * Retrieve incoming source route for use in replies,
3703 * in the same form used by setsockopt.
3704 * The first hop is placed before the options and will be removed later.
3705 */
3706struct mbuf *
2d21ac55 3707ip_srcroute(void)
1c79356b 3708{
2d21ac55
A
3709 struct in_addr *p, *q;
3710 struct mbuf *m;
1c79356b 3711
0a7de745
A
3712 if (ip_nhops == 0) {
3713 return NULL;
3714 }
39236c6e 3715
1c79356b 3716 m = m_get(M_DONTWAIT, MT_HEADER);
0a7de745
A
3717 if (m == NULL) {
3718 return NULL;
3719 }
1c79356b 3720
0a7de745 3721#define OPTSIZ (sizeof (ip_srcrt.nop) + sizeof (ip_srcrt.srcopt))
1c79356b
A
3722
3723 /* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */
0a7de745
A
3724 m->m_len = ip_nhops * sizeof(struct in_addr) +
3725 sizeof(struct in_addr) + OPTSIZ;
1c79356b 3726#if DIAGNOSTIC
0a7de745 3727 if (ipprintfs) {
1c79356b 3728 printf("ip_srcroute: nhops %d mlen %d", ip_nhops, m->m_len);
0a7de745 3729 }
1c79356b
A
3730#endif
3731
3732 /*
3733 * First save first hop for return route
3734 */
3735 p = &ip_srcrt.route[ip_nhops - 1];
3736 *(mtod(m, struct in_addr *)) = *p--;
3737#if DIAGNOSTIC
0a7de745 3738 if (ipprintfs) {
39236c6e
A
3739 printf(" hops %lx",
3740 (u_int32_t)ntohl(mtod(m, struct in_addr *)->s_addr));
0a7de745 3741 }
1c79356b
A
3742#endif
3743
3744 /*
3745 * Copy option fields and padding (nop) to mbuf.
3746 */
3747 ip_srcrt.nop = IPOPT_NOP;
3748 ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF;
0a7de745 3749 (void) memcpy(mtod(m, caddr_t) + sizeof(struct in_addr),
1c79356b 3750 &ip_srcrt.nop, OPTSIZ);
316670eb 3751 q = (struct in_addr *)(void *)(mtod(m, caddr_t) +
0a7de745 3752 sizeof(struct in_addr) + OPTSIZ);
1c79356b
A
3753#undef OPTSIZ
3754 /*
3755 * Record return path as an IP source route,
3756 * reversing the path (pointers are now aligned).
3757 */
3758 while (p >= ip_srcrt.route) {
3759#if DIAGNOSTIC
0a7de745 3760 if (ipprintfs) {
b0d623f7 3761 printf(" %lx", (u_int32_t)ntohl(q->s_addr));
0a7de745 3762 }
1c79356b
A
3763#endif
3764 *q++ = *p--;
3765 }
3766 /*
3767 * Last hop goes to final destination.
3768 */
3769 *q = ip_srcrt.dst;
3770#if DIAGNOSTIC
0a7de745 3771 if (ipprintfs) {
b0d623f7 3772 printf(" %lx\n", (u_int32_t)ntohl(q->s_addr));
0a7de745 3773 }
1c79356b 3774#endif
0a7de745 3775 return m;
1c79356b
A
3776}
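/*
 * A small user-space illustration of the reversal done above: given the hops
 * recorded on the way in (in arrival order) and the original source address,
 * the reply's source route lists the hops in reverse order and ends at that
 * source.  Plain arrays stand in for ip_srcrt and the mbuf; `out' must have
 * room for nhops + 1 entries.
 */
#include <netinet/in.h>

static void
reverse_source_route(const struct in_addr *hops, int nhops,
    struct in_addr reply_dst, struct in_addr *out)
{
	int i;

	for (i = 0; i < nhops; i++) {
		out[i] = hops[nhops - 1 - i];   /* walk the recorded hops backwards */
	}
	out[nhops] = reply_dst;                 /* last hop is the final destination */
}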
3777
3778/*
5ba3f43e 3779 * Strip out IP options at a higher-level protocol in the kernel.
1c79356b
A
3780 */
3781void
5ba3f43e 3782ip_stripoptions(struct mbuf *m)
1c79356b 3783{
2d21ac55 3784 int i;
1c79356b 3785 struct ip *ip = mtod(m, struct ip *);
2d21ac55 3786 caddr_t opts;
1c79356b
A
3787 int olen;
3788
316670eb
A
3789 /* Expect 32-bit aligned data pointer on strict-align platforms */
3790 MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
3791
5ba3f43e 3792 /* use bcopy() since it supports overlapping range */
0a7de745 3793 olen = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof(struct ip);
1c79356b 3794 opts = (caddr_t)(ip + 1);
0a7de745 3795 i = m->m_len - (sizeof(struct ip) + olen);
1c79356b
A
3796 bcopy(opts + olen, opts, (unsigned)i);
3797 m->m_len -= olen;
0a7de745 3798 if (m->m_flags & M_PKTHDR) {
1c79356b 3799 m->m_pkthdr.len -= olen;
0a7de745
A
3800 }
3801 ip->ip_vhl = IP_MAKE_VHL(IPVERSION, sizeof(struct ip) >> 2);
5ba3f43e
A
3802
3803 /*
3804 * We expect ip_{off,len} to be in host order by now, and
3805 * that the original IP header length has been subtracted
3806 * out from ip_len. Temporarily adjust ip_len for checksum
3807 * recalculation, and restore it afterwards.
3808 */
0a7de745 3809 ip->ip_len += sizeof(struct ip);
5ba3f43e
A
3810
3811 /* recompute checksum now that IP header is smaller */
3812#if BYTE_ORDER != BIG_ENDIAN
3813 HTONS(ip->ip_len);
3814 HTONS(ip->ip_off);
3815#endif /* BYTE_ORDER != BIG_ENDIAN */
3816 ip->ip_sum = in_cksum_hdr(ip);
3817#if BYTE_ORDER != BIG_ENDIAN
3818 NTOHS(ip->ip_off);
3819 NTOHS(ip->ip_len);
3820#endif /* BYTE_ORDER != BIG_ENDIAN */
3821
0a7de745 3822 ip->ip_len -= sizeof(struct ip);
cb323159
A
3823
3824 /*
3825 * Given that we've just stripped IP options from the header,
3826 * we need to adjust the start offset accordingly if this
3827 * packet had gone thru partial checksum offload.
3828 */
3829 if ((m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
3830 (CSUM_DATA_VALID | CSUM_PARTIAL)) {
3831 if (m->m_pkthdr.csum_rx_start >= (sizeof(struct ip) + olen)) {
3832 /* most common case */
3833 m->m_pkthdr.csum_rx_start -= olen;
3834 } else {
3835 /* compute checksum in software instead */
3836 m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
3837 m->m_pkthdr.csum_data = 0;
3838 ipstat.ips_adj_hwcsum_clr++;
3839 }
3840 }
1c79356b
A
3841}
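/*
 * A user-space sketch of the same operation on a contiguous packet buffer
 * rather than an mbuf: slide the payload down over the options (memmove() is
 * overlap-safe, like the bcopy() above), rewrite the header-length nibble to
 * the minimal 20-byte header, and fix up ip_len.  It assumes `len' is the
 * total datagram length and leaves recomputing ip_sum to the caller.
 */
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/ip.h>

static size_t
strip_ip_options(unsigned char *pkt, size_t len)
{
	struct ip *iph = (struct ip *)pkt;
	size_t hlen = (size_t)iph->ip_hl << 2;
	size_t olen = hlen - sizeof(struct ip);

	if (olen == 0 || len < hlen) {
		return len;                     /* no options, or malformed */
	}
	memmove(pkt + sizeof(struct ip), pkt + hlen, len - hlen);
	iph->ip_hl = sizeof(struct ip) >> 2;    /* header is 20 bytes again */
	iph->ip_len = htons((uint16_t)(len - olen));
	iph->ip_sum = 0;                        /* caller recomputes the checksum */
	return len - olen;
}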
3842
3843u_char inetctlerrmap[PRC_NCMDS] = {
0a7de745
A
3844 0, 0, 0, 0,
3845 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH,
3846 ENETUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED,
3847 EMSGSIZE, EHOSTUNREACH, 0, 0,
cb323159 3848 0, 0, EHOSTUNREACH, 0,
0a7de745 3849 ENOPROTOOPT, ECONNREFUSED
1c79356b
A
3850};
3851
b0d623f7
A
3852static int
3853sysctl_ipforwarding SYSCTL_HANDLER_ARGS
3854{
3855#pragma unused(arg1, arg2)
3856 int i, was_ipforwarding = ipforwarding;
3857
3858 i = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
0a7de745
A
3859 if (i != 0 || req->newptr == USER_ADDR_NULL) {
3860 return i;
3861 }
b0d623f7
A
3862
3863 if (was_ipforwarding && !ipforwarding) {
3864 /* clean up IPv4 forwarding cached routes */
3865 ifnet_head_lock_shared();
3866 for (i = 0; i <= if_index; i++) {
3867 struct ifnet *ifp = ifindex2ifnet[i];
3868 if (ifp != NULL) {
6d2010ae 3869 lck_mtx_lock(&ifp->if_cached_route_lock);
39236c6e 3870 ROUTE_RELEASE(&ifp->if_fwd_route);
6d2010ae 3871 bzero(&ifp->if_fwd_route,
0a7de745 3872 sizeof(ifp->if_fwd_route));
6d2010ae 3873 lck_mtx_unlock(&ifp->if_cached_route_lock);
b0d623f7
A
3874 }
3875 }
3876 ifnet_head_done();
3877 }
3878
0a7de745 3879 return 0;
b0d623f7
A
3880}
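/*
 * From user space this handler is reached through sysctl; a sketch of flipping
 * the knob, assuming the conventional net.inet.ip.forwarding name (requires
 * root).  The old value is returned through the same call.
 */
#include <stdio.h>
#include <sys/types.h>
#include <sys/sysctl.h>

static int
set_ip_forwarding(int enable)
{
	int old = 0;
	size_t oldlen = sizeof(old);

	if (sysctlbyname("net.inet.ip.forwarding", &old, &oldlen,
	    &enable, sizeof(enable)) == -1) {
		perror("sysctlbyname");
		return -1;
	}
	printf("ip forwarding: %d -> %d\n", old, enable);
	return 0;
}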
3881
3882/*
3883 * Similar to inp_route_{copyout,copyin} routines except that these copy
3884 * out the cached IPv4 forwarding route from struct ifnet instead of the
3885 * inpcb. See comments for those routines for explanations.
3886 */
3887static void
3888ip_fwd_route_copyout(struct ifnet *ifp, struct route *dst)
3889{
3890 struct route *src = &ifp->if_fwd_route;
3891
6d2010ae
A
3892 lck_mtx_lock_spin(&ifp->if_cached_route_lock);
3893 lck_mtx_convert_spin(&ifp->if_cached_route_lock);
b0d623f7
A
3894
3895 /* Minor sanity check */
0a7de745 3896 if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET) {
b0d623f7 3897 panic("%s: wrong or corrupted route: %p", __func__, src);
0a7de745 3898 }
b0d623f7 3899
0a7de745 3900 route_copyout(dst, src, sizeof(*dst));
b0d623f7 3901
6d2010ae 3902 lck_mtx_unlock(&ifp->if_cached_route_lock);
b0d623f7
A
3903}
3904
3905static void
3906ip_fwd_route_copyin(struct ifnet *ifp, struct route *src)
3907{
3908 struct route *dst = &ifp->if_fwd_route;
3909
6d2010ae
A
3910 lck_mtx_lock_spin(&ifp->if_cached_route_lock);
3911 lck_mtx_convert_spin(&ifp->if_cached_route_lock);
b0d623f7
A
3912
3913 /* Minor sanity check */
0a7de745 3914 if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET) {
b0d623f7 3915 panic("%s: wrong or corrupted route: %p", __func__, src);
0a7de745 3916 }
b0d623f7 3917
0a7de745
A
3918 if (ifp->if_fwd_cacheok) {
3919 route_copyin(src, dst, sizeof(*src));
3920 }
b0d623f7 3921
6d2010ae 3922 lck_mtx_unlock(&ifp->if_cached_route_lock);
b0d623f7
A
3923}
3924
1c79356b
A
3925/*
3926 * Forward a packet. If some error occurs return the sender
3927 * an icmp packet. Note we can't always generate a meaningful
3928 * icmp message because icmp doesn't have a large enough repertoire
3929 * of codes and types.
3930 *
3931 * If not forwarding, just drop the packet. This could be confusing
3932 * if ipforwarding was zero but some routing protocol was advancing
3933 * us as a gateway to somewhere. However, we must let the routing
3934 * protocol deal with that.
3935 *
3936 * The srcrt parameter indicates whether the packet is being forwarded
3937 * via a source route.
3938 */
9bccf70c 3939static void
b0d623f7 3940ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop)
1c79356b 3941{
b0d623f7
A
3942#if !IPFIREWALL
3943#pragma unused(next_hop)
3944#endif
2d21ac55
A
3945 struct ip *ip = mtod(m, struct ip *);
3946 struct sockaddr_in *sin;
3947 struct rtentry *rt;
b0d623f7 3948 struct route fwd_rt;
1c79356b
A
3949 int error, type = 0, code = 0;
3950 struct mbuf *mcopy;
3951 n_long dest;
91447636 3952 struct in_addr pkt_dst;
39236c6e 3953 u_int32_t nextmtu = 0, len;
a39ff7e2 3954 struct ip_out_args ipoa;
39236c6e 3955 struct ifnet *rcvifp = m->m_pkthdr.rcvif;
a39ff7e2
A
3956
3957 bzero(&ipoa, sizeof(ipoa));
3958 ipoa.ipoa_boundif = IFSCOPE_NONE;
3959 ipoa.ipoa_sotc = SO_TC_UNSPEC;
3960 ipoa.ipoa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
3961
39236c6e
A
3962#if IPSEC
3963 struct secpolicy *sp = NULL;
3964 int ipsecerror;
3965#endif /* IPSEC */
b0d623f7
A
3966#if PF
3967 struct pf_mtag *pf_mtag;
3968#endif /* PF */
1c79356b
A
3969
3970 dest = 0;
b0d623f7 3971#if IPFIREWALL
91447636
A
3972 /*
3973 * Cache the destination address of the packet; this may be
3974 * changed by use of 'ipfw fwd'.
3975 */
39236c6e
A
3976 pkt_dst = ((next_hop != NULL) ? next_hop->sin_addr : ip->ip_dst);
3977#else /* !IPFIREWALL */
b0d623f7 3978 pkt_dst = ip->ip_dst;
39236c6e 3979#endif /* !IPFIREWALL */
91447636 3980
1c79356b 3981#if DIAGNOSTIC
0a7de745 3982 if (ipprintfs) {
1c79356b 3983 printf("forward: src %lx dst %lx ttl %x\n",
b0d623f7 3984 (u_int32_t)ip->ip_src.s_addr, (u_int32_t)pkt_dst.s_addr,
1c79356b 3985 ip->ip_ttl);
0a7de745 3986 }
1c79356b
A
3987#endif
3988
0a7de745 3989 if (m->m_flags & (M_BCAST | M_MCAST) || !in_canforward(pkt_dst)) {
b0d623f7 3990 OSAddAtomic(1, &ipstat.ips_cantforward);
1c79356b
A
3991 m_freem(m);
3992 return;
3993 }
9bccf70c
A
3994#if IPSTEALTH
3995 if (!ipstealth) {
39236c6e 3996#endif /* IPSTEALTH */
0a7de745
A
3997 if (ip->ip_ttl <= IPTTLDEC) {
3998 icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS,
3999 dest, 0);
4000 return;
1c79356b 4001 }
0a7de745
A
4002#if IPSTEALTH
4003}
39236c6e 4004#endif /* IPSTEALTH */
1c79356b 4005
b0d623f7
A
4006#if PF
4007 pf_mtag = pf_find_mtag(m);
316670eb
A
4008 if (pf_mtag != NULL && pf_mtag->pftag_rtableid != IFSCOPE_NONE) {
4009 ipoa.ipoa_boundif = pf_mtag->pftag_rtableid;
4010 ipoa.ipoa_flags |= IPOAF_BOUND_IF;
4011 }
b0d623f7
A
4012#endif /* PF */
4013
39236c6e
A
4014 ip_fwd_route_copyout(rcvifp, &fwd_rt);
4015
4016 sin = SIN(&fwd_rt.ro_dst);
4017 if (ROUTE_UNUSABLE(&fwd_rt) || pkt_dst.s_addr != sin->sin_addr.s_addr) {
4018 ROUTE_RELEASE(&fwd_rt);
b0d623f7 4019
1c79356b 4020 sin->sin_family = AF_INET;
0a7de745 4021 sin->sin_len = sizeof(*sin);
91447636 4022 sin->sin_addr = pkt_dst;
1c79356b 4023
6d2010ae 4024 rtalloc_scoped_ign(&fwd_rt, RTF_PRCLONING, ipoa.ipoa_boundif);
b0d623f7 4025 if (fwd_rt.ro_rt == NULL) {
1c79356b 4026 icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0);
b0d623f7 4027 goto done;
1c79356b 4028 }
1c79356b 4029 }
b0d623f7 4030 rt = fwd_rt.ro_rt;
1c79356b
A
4031
4032 /*
9bccf70c
A
4033 * Save the IP header and at most 8 bytes of the payload,
4034 * in case we need to generate an ICMP message to the src.
4035 *
4036 * We don't use m_copy() because it might return a reference
4037 * to a shared cluster. Both this function and ip_output()
4038 * assume exclusive access to the IP header in `m', so any
4039 * data in a cluster may change before we reach icmp_error().
1c79356b 4040 */
9bccf70c
A
4041 MGET(mcopy, M_DONTWAIT, m->m_type);
4042 if (mcopy != NULL) {
4043 M_COPY_PKTHDR(mcopy, m);
4044 mcopy->m_len = imin((IP_VHL_HL(ip->ip_vhl) << 2) + 8,
4045 (int)ip->ip_len);
4046 m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t));
4047 }
4048
4049#if IPSTEALTH
4050 if (!ipstealth) {
39236c6e 4051#endif /* IPSTEALTH */
0a7de745 4052 ip->ip_ttl -= IPTTLDEC;
9bccf70c 4053#if IPSTEALTH
0a7de745 4054}
39236c6e 4055#endif /* IPSTEALTH */
1c79356b
A
4056
4057 /*
4058 * If forwarding packet using same interface that it came in on,
4059 * perhaps should send a redirect to sender to shortcut a hop.
4060 * Only send redirect if source is sending directly to us,
4061 * and if packet was not source routed (or has any options).
4062 * Also, don't send redirect if forwarding using a default route
4063 * or a route modified by a redirect.
4064 */
b0d623f7 4065 RT_LOCK_SPIN(rt);
1c79356b 4066 if (rt->rt_ifp == m->m_pkthdr.rcvif &&
0a7de745 4067 !(rt->rt_flags & (RTF_DYNAMIC | RTF_MODIFIED)) &&
39236c6e 4068 satosin(rt_key(rt))->sin_addr.s_addr != INADDR_ANY &&
6d2010ae
A
4069 ipsendredirects && !srcrt && rt->rt_ifa != NULL) {
4070 struct in_ifaddr *ia = (struct in_ifaddr *)rt->rt_ifa;
b0d623f7 4071 u_int32_t src = ntohl(ip->ip_src.s_addr);
1c79356b 4072
6d2010ae
A
4073 /* Become a regular mutex */
4074 RT_CONVERT_LOCK(rt);
4075 IFA_LOCK_SPIN(&ia->ia_ifa);
4076 if ((src & ia->ia_subnetmask) == ia->ia_subnet) {
0a7de745 4077 if (rt->rt_flags & RTF_GATEWAY) {
6d2010ae 4078 dest = satosin(rt->rt_gateway)->sin_addr.s_addr;
0a7de745 4079 } else {
6d2010ae 4080 dest = pkt_dst.s_addr;
0a7de745 4081 }
39236c6e
A
4082 /*
4083 * Router requirements say to only send
4084 * host redirects.
4085 */
6d2010ae
A
4086 type = ICMP_REDIRECT;
4087 code = ICMP_REDIRECT_HOST;
1c79356b 4088#if DIAGNOSTIC
0a7de745 4089 if (ipprintfs) {
39236c6e
A
4090 printf("redirect (%d) to %lx\n", code,
4091 (u_int32_t)dest);
0a7de745 4092 }
1c79356b
A
4093#endif
4094 }
6d2010ae 4095 IFA_UNLOCK(&ia->ia_ifa);
1c79356b 4096 }
b0d623f7 4097 RT_UNLOCK(rt);
1c79356b 4098
b0d623f7 4099#if IPFIREWALL
39236c6e 4100 if (next_hop != NULL) {
91447636
A
4101 /* Pass IPFORWARD info if available */
4102 struct m_tag *tag;
39236c6e 4103 struct ip_fwd_tag *ipfwd_tag;
b0d623f7 4104
6d2010ae 4105 tag = m_tag_create(KERNEL_MODULE_TAG_ID,
b0d623f7 4106 KERNEL_TAG_TYPE_IPFORWARD,
0a7de745 4107 sizeof(*ipfwd_tag), M_NOWAIT, m);
91447636
A
4108 if (tag == NULL) {
4109 error = ENOBUFS;
4110 m_freem(m);
b0d623f7 4111 goto done;
91447636 4112 }
b0d623f7 4113
0a7de745 4114 ipfwd_tag = (struct ip_fwd_tag *)(tag + 1);
91447636
A
4115 ipfwd_tag->next_hop = next_hop;
4116
4117 m_tag_prepend(m, tag);
4118 }
39236c6e
A
4119#endif /* IPFIREWALL */
4120
4121 /* Mark this packet as being forwarded from another interface */
4122 m->m_pkthdr.pkt_flags |= PKTF_FORWARDED;
4123 len = m_pktlen(m);
4124
4125 error = ip_output(m, NULL, &fwd_rt, IP_FORWARDING | IP_OUTARGS,
4126 NULL, &ipoa);
b0d623f7
A
4127
4128 /* Refresh rt since the route could have changed while in IP */
4129 rt = fwd_rt.ro_rt;
4130
39236c6e 4131 if (error != 0) {
b0d623f7
A
4132 OSAddAtomic(1, &ipstat.ips_cantforward);
4133 } else {
39236c6e
A
4134 /*
4135 * Increment stats on the source interface; the ones
4136 * for destination interface has been taken care of
4137 * during output above by virtue of PKTF_FORWARDED.
4138 */
4139 rcvifp->if_fpackets++;
4140 rcvifp->if_fbytes += len;
4141
b0d623f7 4142 OSAddAtomic(1, &ipstat.ips_forward);
39236c6e 4143 if (type != 0) {
b0d623f7 4144 OSAddAtomic(1, &ipstat.ips_redirectsent);
39236c6e
A
4145 } else {
4146 if (mcopy != NULL) {
b0d623f7
A
4147 /*
4148 * If we didn't have to go thru ipflow and
4149 * the packet was successfully consumed by
4150 * ip_output, the mcopy is rather a waste;
4151 * this could be further optimized.
4152 */
1c79356b
A
4153 m_freem(mcopy);
4154 }
b0d623f7 4155 goto done;
1c79356b
A
4156 }
4157 }
0a7de745 4158 if (mcopy == NULL) {
b0d623f7 4159 goto done;
0a7de745 4160 }
1c79356b
A
4161
4162 switch (error) {
0a7de745 4163 case 0: /* forwarded, but need redirect */
1c79356b
A
4164 /* type, code set above */
4165 break;
4166
0a7de745 4167 case ENETUNREACH: /* shouldn't happen, checked above */
1c79356b
A
4168 case EHOSTUNREACH:
4169 case ENETDOWN:
4170 case EHOSTDOWN:
4171 default:
4172 type = ICMP_UNREACH;
4173 code = ICMP_UNREACH_HOST;
4174 break;
4175
4176 case EMSGSIZE:
4177 type = ICMP_UNREACH;
4178 code = ICMP_UNREACH_NEEDFRAG;
39236c6e
A
4179
4180 if (rt == NULL) {
4181 break;
4182 } else {
b0d623f7 4183 RT_LOCK_SPIN(rt);
0a7de745 4184 if (rt->rt_ifp != NULL) {
b0d623f7 4185 nextmtu = rt->rt_ifp->if_mtu;
0a7de745 4186 }
b0d623f7
A
4187 RT_UNLOCK(rt);
4188 }
39236c6e 4189#ifdef IPSEC
0a7de745 4190 if (ipsec_bypass) {
39236c6e 4191 break;
0a7de745 4192 }
39236c6e 4193
1c79356b
A
4194 /*
4195 * If the packet is routed over IPsec tunnel, tell the
4196 * originator the tunnel MTU.
4197 * tunnel MTU = if MTU - sizeof(IP) - ESP/AH hdrsiz
4198 * XXX quickhack!!!
4199 */
39236c6e
A
4200 sp = ipsec4_getpolicybyaddr(mcopy, IPSEC_DIR_OUTBOUND,
4201 IP_FORWARDING, &ipsecerror);
1c79356b 4202
0a7de745 4203 if (sp == NULL) {
39236c6e 4204 break;
0a7de745 4205 }
b0d623f7 4206
39236c6e
A
4207 /*
4208 * find the correct route for outer IPv4
4209 * header, compute tunnel MTU.
4210 */
4211 nextmtu = 0;
1c79356b 4212
39236c6e
A
4213 if (sp->req != NULL &&
4214 sp->req->saidx.mode == IPSEC_MODE_TUNNEL) {
4215 struct secasindex saidx;
4216 struct secasvar *sav;
4217 struct route *ro;
4218 struct ip *ipm;
4219 int ipsechdr;
1c79356b 4220
39236c6e
A
4221 /* count IPsec header size */
4222 ipsechdr = ipsec_hdrsiz(sp);
4223
4224 ipm = mtod(mcopy, struct ip *);
0a7de745 4225 bcopy(&sp->req->saidx, &saidx, sizeof(saidx));
39236c6e
A
4226 saidx.mode = sp->req->saidx.mode;
4227 saidx.reqid = sp->req->saidx.reqid;
4228 sin = SIN(&saidx.src);
4229 if (sin->sin_len == 0) {
0a7de745 4230 sin->sin_len = sizeof(*sin);
39236c6e
A
4231 sin->sin_family = AF_INET;
4232 sin->sin_port = IPSEC_PORT_ANY;
4233 bcopy(&ipm->ip_src, &sin->sin_addr,
0a7de745 4234 sizeof(sin->sin_addr));
39236c6e
A
4235 }
4236 sin = SIN(&saidx.dst);
4237 if (sin->sin_len == 0) {
0a7de745 4238 sin->sin_len = sizeof(*sin);
39236c6e
A
4239 sin->sin_family = AF_INET;
4240 sin->sin_port = IPSEC_PORT_ANY;
4241 bcopy(&ipm->ip_dst, &sin->sin_addr,
0a7de745 4242 sizeof(sin->sin_addr));
39236c6e
A
4243 }
4244 sav = key_allocsa_policy(&saidx);
4245 if (sav != NULL) {
4246 lck_mtx_lock(sadb_mutex);
4247 if (sav->sah != NULL) {
5c9f4661 4248 ro = (struct route *)&sav->sah->sa_route;
39236c6e
A
4249 if (ro->ro_rt != NULL) {
4250 RT_LOCK(ro->ro_rt);
4251 if (ro->ro_rt->rt_ifp != NULL) {
4252 nextmtu = ro->ro_rt->
4253 rt_ifp->if_mtu;
4254 nextmtu -= ipsechdr;
2d21ac55 4255 }
39236c6e 4256 RT_UNLOCK(ro->ro_rt);
1c79356b
A
4257 }
4258 }
39236c6e
A
4259 key_freesav(sav, KEY_SADB_LOCKED);
4260 lck_mtx_unlock(sadb_mutex);
1c79356b
A
4261 }
4262 }
39236c6e
A
4263 key_freesp(sp, KEY_SADB_UNLOCKED);
4264#endif /* IPSEC */
1c79356b
A
4265 break;
4266
4267 case ENOBUFS:
39236c6e
A
4268 /*
4269 * A router should not generate ICMP_SOURCEQUENCH, as
4270 * required by RFC 1812 (Requirements for IP Version 4 Routers).
4271 * Source quench could be a big problem under DoS attacks,
4272 * or if the underlying interface is rate-limited.
4273 * Those who need source quench packets may re-enable them
4274 * via the net.inet.ip.sendsourcequench sysctl.
4275 */
4276 if (ip_sendsourcequench == 0) {
4277 m_freem(mcopy);
4278 goto done;
4279 } else {
4280 type = ICMP_SOURCEQUENCH;
4281 code = 0;
4282 }
1c79356b 4283 break;
9bccf70c 4284
0a7de745 4285 case EACCES: /* ipfw denied packet */
9bccf70c 4286 m_freem(mcopy);
b0d623f7 4287 goto done;
1c79356b 4288 }
b0d623f7 4289
0a7de745 4290 if (type == ICMP_UNREACH && code == ICMP_UNREACH_NEEDFRAG) {
39236c6e 4291 OSAddAtomic(1, &ipstat.ips_cantfrag);
0a7de745 4292 }
39236c6e 4293
b0d623f7
A
4294 icmp_error(mcopy, type, code, dest, nextmtu);
4295done:
39236c6e 4296 ip_fwd_route_copyin(rcvifp, &fwd_rt);
1c79356b
A
4297}
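/*
 * A reduced user-space sketch of the redirect test above: a redirect is only
 * worth sending when the packet leaves on the interface it arrived on, was not
 * source routed, and the original sender sits on that interface's directly
 * attached subnet.  Addresses and masks are host-order uint32_t values here;
 * no mbufs, routes or locks.
 */
#include <stdint.h>

static int
should_send_redirect(int same_ifp, int src_routed,
    uint32_t src, uint32_t subnet, uint32_t subnetmask)
{
	if (!same_ifp || src_routed) {
		return 0;
	}
	return (src & subnetmask) == subnet;    /* sender is on the local wire */
}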
4298
6d2010ae 4299int
39236c6e
A
4300ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip,
4301 struct mbuf *m)
1c79356b 4302{
6d2010ae 4303 *mp = NULL;
1c79356b
A
4304 if (inp->inp_socket->so_options & SO_TIMESTAMP) {
4305 struct timeval tv;
4306
39236c6e 4307 getmicrotime(&tv);
0a7de745 4308 mp = sbcreatecontrol_mbuf((caddr_t)&tv, sizeof(tv),
39236c6e 4309 SCM_TIMESTAMP, SOL_SOCKET, mp);
6d2010ae
A
4310 if (*mp == NULL) {
4311 goto no_mbufs;
4312 }
1c79356b 4313 }
39236c6e 4314 if (inp->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) {
6d2010ae
A
4315 uint64_t time;
4316
4317 time = mach_absolute_time();
0a7de745 4318 mp = sbcreatecontrol_mbuf((caddr_t)&time, sizeof(time),
39236c6e 4319 SCM_TIMESTAMP_MONOTONIC, SOL_SOCKET, mp);
6d2010ae
A
4320 if (*mp == NULL) {
4321 goto no_mbufs;
4322 }
39236c6e 4323 }
d9a64523
A
4324 if (inp->inp_socket->so_options & SO_TIMESTAMP_CONTINUOUS) {
4325 uint64_t time;
4326
4327 time = mach_continuous_time();
0a7de745
A
4328 mp = sbcreatecontrol_mbuf((caddr_t)&time, sizeof(time),
4329 SCM_TIMESTAMP_CONTINUOUS, SOL_SOCKET, mp);
d9a64523
A
4330 if (*mp == NULL) {
4331 goto no_mbufs;
4332 }
4333 }
1c79356b 4334 if (inp->inp_flags & INP_RECVDSTADDR) {
39236c6e 4335 mp = sbcreatecontrol_mbuf((caddr_t)&ip->ip_dst,
0a7de745 4336 sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP, mp);
6d2010ae
A
4337 if (*mp == NULL) {
4338 goto no_mbufs;
4339 }
1c79356b
A
4340 }
4341#ifdef notyet
39236c6e
A
4342 /*
4343 * XXX
1c79356b
A
4344 * Moving these out of udp_input() made them even more broken
4345 * than they already were.
4346 */
4347 /* options were tossed already */
4348 if (inp->inp_flags & INP_RECVOPTS) {
39236c6e 4349 mp = sbcreatecontrol_mbuf((caddr_t)opts_deleted_above,
0a7de745 4350 sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP, mp);
6d2010ae
A
4351 if (*mp == NULL) {
4352 goto no_mbufs;
4353 }
1c79356b
A
4354 }
4355 /* ip_srcroute doesn't do what we want here, need to fix */
4356 if (inp->inp_flags & INP_RECVRETOPTS) {
39236c6e 4357 mp = sbcreatecontrol_mbuf((caddr_t)ip_srcroute(),
0a7de745 4358 sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP, mp);
6d2010ae
A
4359 if (*mp == NULL) {
4360 goto no_mbufs;
4361 }
1c79356b 4362 }
39236c6e 4363#endif /* notyet */
1c79356b
A
4364 if (inp->inp_flags & INP_RECVIF) {
4365 struct ifnet *ifp;
39236c6e
A
4366 uint8_t sdlbuf[SOCK_MAXADDRLEN + 1];
4367 struct sockaddr_dl *sdl2 = SDL(&sdlbuf);
4368
4369 /*
4370 * Make sure to accommodate the largest possible
4371 * size of SA(if_lladdr)->sa_len.
4372 */
0a7de745 4373 _CASSERT(sizeof(sdlbuf) == (SOCK_MAXADDRLEN + 1));
1c79356b 4374
91447636 4375 ifnet_head_lock_shared();
6d2010ae
A
4376 if ((ifp = m->m_pkthdr.rcvif) != NULL &&
4377 ifp->if_index && (ifp->if_index <= if_index)) {
13fec989 4378 struct ifaddr *ifa = ifnet_addrs[ifp->if_index - 1];
39236c6e 4379 struct sockaddr_dl *sdp;
2d21ac55 4380
0a7de745 4381 if (!ifa || !ifa->ifa_addr) {
13fec989 4382 goto makedummy;
0a7de745 4383 }
2d21ac55 4384
6d2010ae 4385 IFA_LOCK_SPIN(ifa);
39236c6e 4386 sdp = SDL(ifa->ifa_addr);
1c79356b
A
4387 /*
4388 * Change our mind and don't try to copy.
4389 */
39236c6e 4390 if (sdp->sdl_family != AF_LINK) {
6d2010ae 4391 IFA_UNLOCK(ifa);
1c79356b
A
4392 goto makedummy;
4393 }
39236c6e 4394 /* the above _CASSERT ensures sdl_len fits in sdlbuf */
1c79356b 4395 bcopy(sdp, sdl2, sdp->sdl_len);
6d2010ae 4396 IFA_UNLOCK(ifa);
1c79356b 4397 } else {
6d2010ae 4398makedummy:
39236c6e
A
4399 sdl2->sdl_len =
4400 offsetof(struct sockaddr_dl, sdl_data[0]);
1c79356b
A
4401 sdl2->sdl_family = AF_LINK;
4402 sdl2->sdl_index = 0;
4403 sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0;
4404 }
91447636 4405 ifnet_head_done();
39236c6e
A
4406 mp = sbcreatecontrol_mbuf((caddr_t)sdl2, sdl2->sdl_len,
4407 IP_RECVIF, IPPROTO_IP, mp);
6d2010ae
A
4408 if (*mp == NULL) {
4409 goto no_mbufs;
4410 }
1c79356b 4411 }
55e303ae 4412 if (inp->inp_flags & INP_RECVTTL) {
39236c6e 4413 mp = sbcreatecontrol_mbuf((caddr_t)&ip->ip_ttl,
0a7de745 4414 sizeof(ip->ip_ttl), IP_RECVTTL, IPPROTO_IP, mp);
6d2010ae
A
4415 if (*mp == NULL) {
4416 goto no_mbufs;
4417 }
4418 }
39236c6e 4419 if (inp->inp_socket->so_flags & SOF_RECV_TRAFFIC_CLASS) {
316670eb
A
4420 int tc = m_get_traffic_class(m);
4421
0a7de745 4422 mp = sbcreatecontrol_mbuf((caddr_t)&tc, sizeof(tc),
39236c6e 4423 SO_TRAFFIC_CLASS, SOL_SOCKET, mp);
6d2010ae
A
4424 if (*mp == NULL) {
4425 goto no_mbufs;
4426 }
4427 }
4428 if (inp->inp_flags & INP_PKTINFO) {
4429 struct in_pktinfo pi;
4430
0a7de745
A
4431 bzero(&pi, sizeof(struct in_pktinfo));
4432 bcopy(&ip->ip_dst, &pi.ipi_addr, sizeof(struct in_addr));
39236c6e
A
4433 pi.ipi_ifindex = (m != NULL && m->m_pkthdr.rcvif != NULL) ?
4434 m->m_pkthdr.rcvif->if_index : 0;
4435
4436 mp = sbcreatecontrol_mbuf((caddr_t)&pi,
0a7de745 4437 sizeof(struct in_pktinfo), IP_RECVPKTINFO, IPPROTO_IP, mp);
6d2010ae
A
4438 if (*mp == NULL) {
4439 goto no_mbufs;
4440 }
55e303ae 4441 }
813fb2f6
A
4442 if (inp->inp_flags & INP_RECVTOS) {
4443 mp = sbcreatecontrol_mbuf((caddr_t)&ip->ip_tos,
4444 sizeof(u_char), IP_RECVTOS, IPPROTO_IP, mp);
4445 if (*mp == NULL) {
4446 goto no_mbufs;
4447 }
4448 }
0a7de745 4449 return 0;
6d2010ae
A
4450
4451no_mbufs:
4452 ipstat.ips_pktdropcntrl++;
0a7de745 4453 return ENOBUFS;
1c79356b
A
4454}
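/*
 * A user-space counterpart sketch: the control messages assembled above are
 * what a datagram receiver sees after enabling the matching socket options.
 * Shown for IP_RECVDSTADDR only; IP_RECVIF, IP_RECVTTL, IP_RECVTOS and the
 * SO_TIMESTAMP variants are parsed the same way.  `fd' is assumed to be a
 * bound UDP socket; error handling is minimal.
 */
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>

static ssize_t
recv_with_dstaddr(int fd, void *buf, size_t buflen, struct in_addr *dst)
{
	char cbuf[CMSG_SPACE(sizeof(struct in_addr))];
	struct iovec iov = { .iov_base = buf, .iov_len = buflen };
	struct msghdr msg;
	struct cmsghdr *cm;
	int one = 1;
	ssize_t n;

	(void) setsockopt(fd, IPPROTO_IP, IP_RECVDSTADDR, &one, sizeof(one));

	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);

	if ((n = recvmsg(fd, &msg, 0)) == -1) {
		return -1;
	}
	for (cm = CMSG_FIRSTHDR(&msg); cm != NULL; cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level == IPPROTO_IP &&
		    cm->cmsg_type == IP_RECVDSTADDR) {
			memcpy(dst, CMSG_DATA(cm), sizeof(*dst));
		}
	}
	return n;
}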
4455
316670eb
A
4456static inline u_short
4457ip_cksum(struct mbuf *m, int hlen)
4458{
316670eb 4459 u_short sum;
316670eb
A
4460
4461 if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) {
4462 sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID);
39236c6e
A
4463 } else if (!(m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) &&
4464 !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
316670eb 4465 /*
39236c6e
A
4466 * The packet arrived on an interface which isn't capable
4467 * of performing IP header checksum; compute it now.
316670eb 4468 */
39236c6e 4469 sum = ip_cksum_hdr_in(m, hlen);
316670eb 4470 } else {
316670eb 4471 sum = 0;
39236c6e
A
4472 m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
4473 CSUM_IP_CHECKED | CSUM_IP_VALID);
4474 m->m_pkthdr.csum_data = 0xffff;
316670eb
A
4475 }
4476
0a7de745 4477 if (sum != 0) {
316670eb 4478 OSAddAtomic(1, &ipstat.ips_badsum);
0a7de745 4479 }
39236c6e 4480
0a7de745 4481 return sum;
39236c6e
A
4482}
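/*
 * For reference, a plain user-space version of the header checksum that
 * ip_cksum_hdr_in() supplies when the driver has not already validated it:
 * a 16-bit one's-complement sum over the header.  When verifying a received
 * header (checksum field included) a correct header makes this return 0;
 * `hlen' is the header length in bytes and must be even.
 */
#include <stddef.h>
#include <stdint.h>

static uint16_t
ip_header_checksum(const void *hdr, size_t hlen)
{
	const uint16_t *p = (const uint16_t *)hdr;
	uint32_t sum = 0;

	for (; hlen > 1; hlen -= 2) {
		sum += *p++;                            /* sum 16-bit words */
	}
	while (sum >> 16) {
		sum = (sum & 0xffff) + (sum >> 16);     /* fold the carries */
	}
	return (uint16_t)~sum;
}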
4483
4484static int
4485ip_getstat SYSCTL_HANDLER_ARGS
4486{
4487#pragma unused(oidp, arg1, arg2)
0a7de745
A
4488 if (req->oldptr == USER_ADDR_NULL) {
4489 req->oldlen = (size_t)sizeof(struct ipstat);
4490 }
39236c6e 4491
0a7de745 4492 return SYSCTL_OUT(req, &ipstat, MIN(sizeof(ipstat), req->oldlen));
39236c6e
A
4493}
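/*
 * A user-space sketch of reading what this handler exports, assuming the
 * usual net.inet.ip.stats sysctl name and the struct ipstat layout from
 * <netinet/ip_var.h>.
 */
#include <stdio.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <netinet/in.h>
#include <netinet/ip_var.h>

static void
print_ip_stats(void)
{
	struct ipstat ips;
	size_t len = sizeof(ips);

	if (sysctlbyname("net.inet.ip.stats", &ips, &len, NULL, 0) == -1) {
		perror("net.inet.ip.stats");
		return;
	}
	printf("forwarded %llu  cantforward %llu  badsum %llu\n",
	    (unsigned long long)ips.ips_forward,
	    (unsigned long long)ips.ips_cantforward,
	    (unsigned long long)ips.ips_badsum);
}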
4494
4495void
4496ip_setsrcifaddr_info(struct mbuf *m, uint32_t src_idx, struct in_ifaddr *ia)
4497{
4498 VERIFY(m->m_flags & M_PKTHDR);
4499
4500 /*
4501 * If the source ifaddr is specified, pick up the information
4502 * from there; otherwise just grab the passed-in ifindex as the
4503 * caller may not have the ifaddr available.
4504 */
4505 if (ia != NULL) {
4506 m->m_pkthdr.pkt_flags |= PKTF_IFAINFO;
4507 m->m_pkthdr.src_ifindex = ia->ia_ifp->if_index;
4508 } else {
4509 m->m_pkthdr.src_ifindex = src_idx;
0a7de745 4510 if (src_idx != 0) {
39236c6e 4511 m->m_pkthdr.pkt_flags |= PKTF_IFAINFO;
0a7de745 4512 }
39236c6e
A
4513 }
4514}
4515
4516void
4517ip_setdstifaddr_info(struct mbuf *m, uint32_t dst_idx, struct in_ifaddr *ia)
4518{
4519 VERIFY(m->m_flags & M_PKTHDR);
4520
4521 /*
4522 * If the destination ifaddr is specified, pick up the information
4523 * from there; otherwise just grab the passed-in ifindex as the
4524 * caller may not have the ifaddr available.
4525 */
4526 if (ia != NULL) {
4527 m->m_pkthdr.pkt_flags |= PKTF_IFAINFO;
4528 m->m_pkthdr.dst_ifindex = ia->ia_ifp->if_index;
4529 } else {
4530 m->m_pkthdr.dst_ifindex = dst_idx;
0a7de745 4531 if (dst_idx != 0) {
39236c6e 4532 m->m_pkthdr.pkt_flags |= PKTF_IFAINFO;
0a7de745 4533 }
39236c6e
A
4534 }
4535}
4536
4537int
4538ip_getsrcifaddr_info(struct mbuf *m, uint32_t *src_idx, uint32_t *iaf)
4539{
4540 VERIFY(m->m_flags & M_PKTHDR);
4541
0a7de745
A
4542 if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO)) {
4543 return -1;
4544 }
39236c6e 4545
0a7de745 4546 if (src_idx != NULL) {
39236c6e 4547 *src_idx = m->m_pkthdr.src_ifindex;
0a7de745 4548 }
39236c6e 4549
0a7de745 4550 if (iaf != NULL) {
39236c6e 4551 *iaf = 0;
0a7de745 4552 }
39236c6e 4553
0a7de745 4554 return 0;
39236c6e
A
4555}
4556
4557int
4558ip_getdstifaddr_info(struct mbuf *m, uint32_t *dst_idx, uint32_t *iaf)
4559{
4560 VERIFY(m->m_flags & M_PKTHDR);
4561
0a7de745
A
4562 if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO)) {
4563 return -1;
4564 }
39236c6e 4565
0a7de745 4566 if (dst_idx != NULL) {
39236c6e 4567 *dst_idx = m->m_pkthdr.dst_ifindex;
0a7de745 4568 }
39236c6e 4569
0a7de745 4570 if (iaf != NULL) {
39236c6e 4571 *iaf = 0;
0a7de745 4572 }
39236c6e 4573
0a7de745 4574 return 0;
39236c6e
A
4575}
4576
4577/*
4578 * Protocol input handler for IPPROTO_GRE.
4579 */
4580void
4581gre_input(struct mbuf *m, int off)
4582{
4583 gre_input_func_t fn = gre_input_func;
4584
4585 /*
4586 * If there is a registered GRE input handler, pass mbuf to it.
4587 */
4588 if (fn != NULL) {
4589 lck_mtx_unlock(inet_domain_mutex);
4590 m = fn(m, off, (mtod(m, struct ip *))->ip_p);
4591 lck_mtx_lock(inet_domain_mutex);
316670eb
A
4592 }
4593
39236c6e
A
4594 /*
4595 * If no tunnel that is up matches, we inject the mbuf
4596 * into raw IP input to see if any raw socket picks it up.
4597 */
0a7de745 4598 if (m != NULL) {
39236c6e 4599 rip_input(m, off);
0a7de745 4600 }
39236c6e
A
4601}
4602
4603/*
4604 * Private KPI for PPP/PPTP.
4605 */
4606int
4607ip_gre_register_input(gre_input_func_t fn)
4608{
4609 lck_mtx_lock(inet_domain_mutex);
4610 gre_input_func = fn;
4611 lck_mtx_unlock(inet_domain_mutex);
4612
0a7de745 4613 return 0;
316670eb 4614}
3e170ce0 4615
39037602 4616#if (DEBUG || DEVELOPMENT)
3e170ce0
A
4617static int
4618sysctl_reset_ip_input_stats SYSCTL_HANDLER_ARGS
4619{
4620#pragma unused(arg1, arg2)
4621 int error, i;
4622
4623 i = ip_input_measure;
4624 error = sysctl_handle_int(oidp, &i, 0, req);
0a7de745 4625 if (error || req->newptr == USER_ADDR_NULL) {
3e170ce0 4626 goto done;
0a7de745 4627 }
3e170ce0
A
4628 /* impose bounds */
4629 if (i < 0 || i > 1) {
4630 error = EINVAL;
4631 goto done;
4632 }
4633 if (ip_input_measure != i && i == 1) {
4634 net_perf_initialize(&net_perf, ip_input_measure_bins);
4635 }
4636 ip_input_measure = i;
4637done:
0a7de745 4638 return error;
3e170ce0
A
4639}
4640
4641static int
4642sysctl_ip_input_measure_bins SYSCTL_HANDLER_ARGS
4643{
4644#pragma unused(arg1, arg2)
4645 int error;
4646 uint64_t i;
4647
4648 i = ip_input_measure_bins;
4649 error = sysctl_handle_quad(oidp, &i, 0, req);
0a7de745 4650 if (error || req->newptr == USER_ADDR_NULL) {
3e170ce0 4651 goto done;
0a7de745 4652 }
3e170ce0
A
4653 /* validate data */
4654 if (!net_perf_validate_bins(i)) {
4655 error = EINVAL;
4656 goto done;
4657 }
4658 ip_input_measure_bins = i;
4659done:
0a7de745 4660 return error;
3e170ce0
A
4661}
4662
4663static int
4664sysctl_ip_input_getperf SYSCTL_HANDLER_ARGS
4665{
4666#pragma unused(oidp, arg1, arg2)
0a7de745
A
4667 if (req->oldptr == USER_ADDR_NULL) {
4668 req->oldlen = (size_t)sizeof(struct ipstat);
4669 }
3e170ce0 4670
0a7de745 4671 return SYSCTL_OUT(req, &net_perf, MIN(sizeof(net_perf), req->oldlen));
3e170ce0 4672}
39037602 4673#endif /* (DEBUG || DEVELOPMENT) */